class X86InterleavedAccessGroup {
  // Data members, deduced from the constructor initializer list below.
  Instruction *const Inst;
  ArrayRef<ShuffleVectorInst *> Shuffles;
  ArrayRef<unsigned> Indices;
  const unsigned Factor;
  const X86Subtarget &Subtarget;
  const DataLayout &DL;
  IRBuilder<> &Builder;

  // Declarations of decompose(), transpose_4x4() and the stride-specific
  // interleave/deinterleave helpers are elided in this excerpt; each stride
  // helper additionally takes an 'unsigned NumSubVecElems' parameter.

public:
  X86InterleavedAccessGroup(Instruction *I,
                            ArrayRef<ShuffleVectorInst *> Shuffs,
                            ArrayRef<unsigned> Ind, const unsigned F,
                            const X86Subtarget &STarget, IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getModule()->getDataLayout()), Builder(B) {}

  /// Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics.
  bool isSupported() const;

  /// Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};
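// Usage (see lowerInterleavedLoad/lowerInterleavedStore at the end of this
// file): build one group per candidate access, bail out unless isSupported(),
// then call lowerIntoOptimizedSequence().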
bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned WideInstSize;

  // Only AVX targets and interleave factors of 3 or 4 are handled.
  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
    return false;

  if (isa<LoadInst>(Inst)) {
    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
    if (cast<LoadInst>(Inst)->getPointerAddressSpace())
      return false;
  } else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  // Stride 4 with 64-bit elements and a 1024-bit wide access.
  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
    return true;

  // Stride-4 stores of i8 vectors.
  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
      (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
       WideInstSize == 2048))
    return true;

  // Stride-3 accesses of i8 vectors.
  if (ShuffleElemSize == 8 && Factor == 3 &&
      (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
    return true;

  return false;
}
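// Illustrative example (not from the original file): a group this predicate
// accepts on AVX -- a 384-bit byte load whose users are three stride-3
// shuffles:
//   %wide = load <48 x i8>, ptr %p
//   %v0 = shufflevector <48 x i8> %wide, <48 x i8> poison,
//                       <16 x i32> <i32 0, i32 3, i32 6, ..., i32 45>
//   (plus the analogous shuffles starting at indices 1 and 2)
// Here Factor == 3, ShuffleElemSize == 8 and WideInstSize == 384.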
// Breaks the wide vector accessed by VecInst into NumSubVectors sub-vectors
// of type SubVecTy. (The parameter list and a few elided declarations are
// reconstructed from the uses below and from the call sites.)
void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {
  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecWidth = VecInst->getType();
  assert(VecWidth->isVectorTy() &&
         DL.getTypeSizeInBits(VecWidth) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  // A wide shufflevector is split into narrower shuffles of its two operands.
  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate NumSubVectors shuffles of SubVecTy type, each starting at the
    // corresponding user index (mask construction reconstructed).
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Indices[i], SubVecTy->getNumElements(),
                                   0))));
    return;
  }

  // Otherwise split the wide load into NumLoads narrower loads.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBaseTy, *VecBasePtrTy;
  unsigned int NumLoads = NumSubVectors;
  unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
  if (VecLength == 768 || VecLength == 1536) {
    // (Narrower base type and base-pointer setup elided in this excerpt.)
    NumLoads = NumSubVectors * (VecLength / 384);
  } else {
    VecBaseTy = SubVecTy;
    // (Base-pointer setup elided in this excerpt.)
  }

  assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) &&
         "VecBaseTy's size must be a multiple of 8");
  const Align FirstAlignment = LI->getAlign();
  const Align SubsequentAlignment = commonAlignment(
      FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedValue() / 8);
  Align Alignment = FirstAlignment;
  for (unsigned i = 0; i < NumLoads; i++) {
    // (GEP computing NewBasePtr for the i-th sub-vector elided.)
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
    DecomposedVectors.push_back(NewLoad);
    Alignment = SubsequentAlignment;
  }
}
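// Example (derived from the code above): a 768-bit stride-3 load group has
// NumLoads = 3 * (768 / 384) = 6, so the wide access is rebuilt as six narrow
// loads; the first keeps the original alignment and the rest use the common
// alignment derived from it.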
// Identity mask 0..63, used when concatenating two sub-vectors into one wide
// vector (declaration head reconstructed from the values below).
static constexpr int Concat[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
265 "This function doesn't accept width smaller then 256");
// Restores the sub-vectors to their original order; effectively the inverse
// of concatSubVector. (Signature from the file's helper list; a few elided
// declarations and arguments are reconstructed.)
static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> &Builder) {
  if (VecElems == 16) {
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
    return;
  }

  SmallVector<int, 32> OptimizeShuf;
  Value *Temp[8];

  for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
    genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
                    (i + 1) / Stride * 16);
    Temp[i / 2] = Builder.CreateShuffleVector(
        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
    OptimizeShuf.clear();
  }

  if (VecElems == 32) {
    std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
    return;
  }

  for (unsigned i = 0; i < Stride; i++)
    TransposedMatrix[i] =
        Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}
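// For VecElems == 16 each output is a single shuffle of one input; for wider
// vectors the inputs are first blended pairwise into Temp via genShuffleBland
// and, in the 64-element case, joined again with the Concat mask above.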
void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  // (Parameter list reconstructed from the call site; the mask vectors and
  //  the IntrVec1Low/IntrVec2Low shuffles of the inputs are elided in this
  //  excerpt.)
  TransposedMatrix.resize(2);

  // Build the byte-interleaving masks (loop body elided in this excerpt).
  for (unsigned i = 0; i < 8; ++i) {
    // ...
  }

  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}
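// Intended result (the elided masks interleave bytes and then 16-bit words):
// the four 8-byte input rows a, b, c, d come out 4-way interleaved as
//   TransposedMatrix[0] = a0 b0 c0 d0 ... a3 b3 c3 d3
//   TransposedMatrix[1] = a4 b4 c4 d4 ... a7 b7 c7 d7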
void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumSubVecElems) {
  // (Parameter list reconstructed from the call site; mask construction and
  //  the intermediate IntrVec shuffles are elided in this excerpt.)
  TransposedMatrix.resize(4);
  Value *VecOut[4];

  for (int i = 0; i < 4; i++)
    VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
                                            /* lane-permute mask elided */);

  std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
}
// Builds a strided (deinterleaving) shuffle mask, one 128-bit lane at a time.
// (Signature from the file's helper list; VectorSize/VF reconstructed from
// their uses.)
static void createShuffleStride(MVT VT, int Stride,
                                SmallVectorImpl<int> &Mask) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  int LaneCount = std::max(VectorSize / 128, 1);
  for (int Lane = 0; Lane < LaneCount; Lane++)
    for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
}
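// Worked example (following the loops above): a 256-bit vector of 16 elements
// has two 8-element lanes, so Stride = 3 yields
//   {0,3,6,1,4,7,2,5, 8,11,14,9,12,15,10,13}
// i.e. a stride-3 gather repeated per 128-bit lane.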
// Computes the size of each of the three stride groups within one lane.
// (Signature from the file's helper list; the per-lane VF computation and the
// push_back are reconstructed.)
static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
  for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
    int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
    SizeInfo.push_back(GroupSize);
    FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
  }
}
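// Worked example (following the loop above): with 8 elements per lane the
// three group sizes are ceil(8/3) = 3, ceil(7/3) = 3 and ceil(6/3) = 2, i.e.
// the stride-3 order {0,3,6,1,4,7,2,5} splits into groups of {3, 3, 2}.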
// Decodes a PALIGNR-style rotate into a shuffle mask, lane by lane.
// AlignDirection selects the alignment direction; Unary keeps both sources
// the same. (Signature from the file's helper list; elided lines
// reconstructed following the generic PALIGNR decode.)
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<int> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
  unsigned NumLaneElts = NumElts / NumLanes;

  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // If i + Offset falls outside this lane, wrap into the other source
      // (or back into the same source when Unary).
      if (Base >= NumLaneElts)
        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
      ShuffleMask.push_back(Base + l);
    }
  }
}
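// Worked example (following the loops above): for a single-lane v16i8 with
// Imm = 3, AlignDirection = true and Unary = true, Offset is 3 and the mask
// becomes {3,4,...,15,0,1,2} -- a byte rotate of the lane by three positions.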
// Concatenates the decomposed sub-vectors into the wide shape expected by the
// lane-based deinterleave; the counterpart of reorderSubVector. (Signature
// from the file's helper list; elided control flow and shuffle operands
// reconstructed.)
static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
                            unsigned VecElems, IRBuilder<> &Builder) {
  if (VecElems == 16) {
    for (int i = 0; i < 3; i++)
      Vec[i] = InVec[i];
    return;
  }

  for (unsigned j = 0; j < VecElems / 32; j++)
    for (int i = 0; i < 3; i++)
      Vec[i + j * 3] = Builder.CreateShuffleVector(
          InVec[j * 6 + i], InVec[j * 6 + i + 3], ArrayRef(Concat, 32));

  if (VecElems == 32)
    return;

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
}
void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // (Parameter list reconstructed from the call site. The mask vectors
  //  VPShuf, VPAlign[], VPAlign2, VPAlign3 and GroupSize, the MVT, and the
  //  createShuffleStride/setGroupSize/concatSubVector calls that seed Vec are
  //  declared or made in elided lines.)
  TransposedMatrix.resize(3);
  Value *Vec[6], *TempVector[3];

  // Build the palignr-style masks from the group sizes (loop body
  // reconstructed).
  for (int i = 0; i < 2; i++)
    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

  // Gather each stride group within every lane.
  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);

  // First cross-vector rotate.
  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

  // Second cross-vector rotate.
  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // Final realignment and output assignment.
  Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3);
  TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2);
  TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
  TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
}
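// Illustrative layout for the VF = 8 case: the three inputs hold the
// interleaved stream
//   a0 b0 c0 a1 b1 c1 a2 b2 | c2 a3 b3 c3 a4 b4 c4 a5 | b5 c5 a6 b6 c6 a7 b7 c7
// and the three outputs are the deinterleaved rows a0..a7, b0..b7 and c0..c7.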
// Converts the group-size vector back into a contiguous interleaving order.
// (Signature from the file's helper list; elided statements reconstructed.)
static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<int> &Output) {
  int IndexGroup[3] = {0, 0, 0};
  int Index = 0;
  int VectorWidth = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  // Find the starting index of each group.
  int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
  for (int i = 0; i < 3; i++) {
    IndexGroup[(Index * 3) % (VF / Lane)] = Index;
    Index += Mask[i];
  }
  // Emit the group indices round-robin.
  for (int i = 0; i < VF / Lane; i++) {
    Output.push_back(IndexGroup[i % 3]);
    IndexGroup[i % 3]++;
  }
}
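// Worked example (following the code above): for one 128-bit lane of 8
// elements and group sizes {3, 3, 2}, IndexGroup becomes {0, 3, 6} and the
// emitted order is {0,3,6,1,4,7,2,5}.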
void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // (Parameter list reconstructed from the call site; the mask vectors
  //  GroupSize, VPShuf, VPAlign[], VPAlign2, VPAlign3 and the MVT are
  //  declared/initialized in elided lines.)
  TransposedMatrix.resize(3);
  Value *Vec[3], *TempVector[3];

  // Build one palignr mask per group (loop body reconstructed).
  for (int i = 0; i < 3; i++)
    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);

  // Pre-rotate the first two inputs; the third is used as-is.
  Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2);
  Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3);
  Vec[2] = InVec[2];

  // First blend/rotate step across neighbouring vectors.
  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);

  // Second step; the final group2Shuffle/reorderSubVector calls that write
  // TransposedMatrix are elided in this excerpt.
  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
                                         VPAlign[2]);
}
void X86InterleavedAccessGroup::transpose_4x4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix) {
  // (Parameter list, Mask rebinds and the intermediate IntrVec1..4 shuffles
  //  of the input rows are reconstructed or elided in this excerpt.)
  TransposedMatrix.resize(4);

  // dst = src1[0,1],src2[0,1]
  static constexpr int IntMask1[] = {0, 1, 4, 5};
  ArrayRef<int> Mask = ArrayRef(IntMask1, 4);

  // dst = src1[2,3],src2[2,3]
  static constexpr int IntMask2[] = {2, 3, 6, 7};
  Mask = ArrayRef(IntMask2, 4);

  // dst = src1[0],src2[0],src1[2],src2[2]
  static constexpr int IntMask3[] = {0, 4, 2, 6};
  Mask = ArrayRef(IntMask3, 4);
  TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);

  // dst = src1[1],src2[1],src1[3],src2[3]
  static constexpr int IntMask4[] = {1, 5, 3, 7};
  Mask = ArrayRef(IntMask4, 4);
  TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
}
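// Intended result: for input rows
//   {a0 a1 a2 a3}, {b0 b1 b2 b3}, {c0 c1 c2 c3}, {d0 d1 d2 d3}
// the outputs are the columns {a0 b0 c0 d0}, {a1 b1 c1 d1}, {a2 b2 c2 d2} and
// {a3 b3 c3 d3}.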
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());

  if (isa<LoadInst>(Inst)) {
    auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
    unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
    // (Case labels reconstructed: only 4/8/16/32/64-element sub-vectors are
    //  handled.)
    switch (NumSubVecElems) {
    default:
      return false;
    case 4: case 8: case 16: case 32: case 64:
      if (ShuffleTy->getNumElements() != NumSubVecElems)
        return false;
      break;
    }

    // Decompose the wide load into target-sized loads, then deinterleave.
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
    if (NumSubVecElems == 4)
      transpose_4x4(DecomposedVectors, TransposedVectors);
    else
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);

    // Replace the strided shuffles with the transposed vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
    return true;
  }

  // Store path: decompose the wide interleaving shuffle (call elided in this
  // excerpt), transpose, concatenate and store.
  Type *ShuffleEltTy = ShuffleTy->getElementType();
  unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;

  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 8:
    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
    break;
  case 16: case 32: case 64:
    // (Factor-based dispatch reconstructed.)
    if (Factor == 4)
      interleave8bitStride4(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    if (Factor == 3)
      interleave8bitStride3(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    break;
  default:
    return false;
  }

  // (Concatenation of TransposedVectors into WideVec and the StoreInst cast
  //  elided in this excerpt.)
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), SI->getAlign());
  return true;
}
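// Illustrative net effect of the store path (not from the original file): for
// Factor == 4 and four <16 x i8> rows r, g, b, a, the transposed vectors are
// concatenated and written back as one wide store laid out as
//   r0 g0 b0 a0 r1 g1 b1 a1 ... r15 g15 b15 a15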
814 "Invalid interleave factor");
815 assert(!Shuffles.
empty() &&
"Empty shufflevector input");
817 "Unmatched number of shufflevectors and indices");
821 X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
824 return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
// Lower interleaved store(s) into target specific instructions/intrinsics.
bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor ==
             0 &&
         "Invalid interleaved store");

  // Collect the starting index of each interleaved shuffle from the wide
  // shuffle mask (Indices/Shuffles setup partially elided in this excerpt).
  for (unsigned i = 0; i < Factor; i++)
    Indices.push_back(Mask[i]);

  // Create an interleaved access group.
  IRBuilder<> Builder(SI);
  X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}
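// Both hooks are TargetLowering overrides; the target-independent
// InterleavedAccess pass invokes them when it finds a candidate interleaved
// load or store group.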