49class X86InterleavedAccessGroup {
61 const unsigned Factor;
92 unsigned NumSubVecElems);
97 unsigned NumSubVecElems);
100 unsigned NumSubVecElems);
114 : Inst(
I), Shuffles(Shuffs), Indices(Ind), Factor(
F), Subtarget(STarget),
115 DL(Inst->getDataLayout()), Builder(
B) {}
119 bool isSupported()
const;
123 bool lowerIntoOptimizedSequence();
128bool X86InterleavedAccessGroup::isSupported()
const {
129 VectorType *ShuffleVecTy = Shuffles[0]->getType();
130 Type *ShuffleEltTy = ShuffleVecTy->getElementType();
131 unsigned ShuffleElemSize =
DL.getTypeSizeInBits(ShuffleEltTy);
132 unsigned WideInstSize;
140 if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
143 if (isa<LoadInst>(Inst)) {
144 WideInstSize =
DL.getTypeSizeInBits(Inst->getType());
148 WideInstSize =
DL.getTypeSizeInBits(Shuffles[0]->
getType());
152 if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
155 if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
156 (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
157 WideInstSize == 2048))
160 if (ShuffleElemSize == 8 && Factor == 3 &&
161 (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
167void X86InterleavedAccessGroup::decompose(
170 assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
171 "Expected Load or Shuffle");
176 DL.getTypeSizeInBits(VecWidth) >=
177 DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
178 "Invalid Inst-size!!!");
180 if (
auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
181 Value *Op0 = SVI->getOperand(0);
182 Value *Op1 = SVI->getOperand(1);
185 for (
unsigned i = 0; i < NumSubVectors; ++i)
187 cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
195 LoadInst *LI = cast<LoadInst>(VecInst);
197 unsigned int NumLoads = NumSubVectors;
201 unsigned VecLength =
DL.getTypeSizeInBits(VecWidth);
203 if (VecLength == 768 || VecLength == 1536) {
205 NumLoads = NumSubVectors * (VecLength / 384);
207 VecBaseTy = SubVecTy;
211 "VecBaseTy's size must be a multiple of 8");
215 Align Alignment = FirstAlignment;
216 for (
unsigned i = 0; i < NumLoads; i++) {
219 Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
221 Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
223 Alignment = SubsequentAlignment;
236 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
237 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
238 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
239 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
261 "This function doesn't accept width smaller then 256");
289 unsigned VecElems,
unsigned Stride,
292 if (VecElems == 16) {
293 for (
unsigned i = 0; i < Stride; i++)
301 for (
unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
303 (i + 1) / Stride * 16);
305 Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
306 OptimizeShuf.
clear();
309 if (VecElems == 32) {
310 std::copy(Temp, Temp + Stride, TransposedMatrix.
begin());
313 for (
unsigned i = 0; i < Stride; i++)
314 TransposedMatrix[i] =
318void X86InterleavedAccessGroup::interleave8bitStride4VF8(
328 TransposedMatrix.
resize(2);
333 for (
unsigned i = 0; i < 8; ++i) {
345 Builder.CreateShuffleVector(
Matrix[0],
Matrix[1], MaskLow);
347 Builder.CreateShuffleVector(
Matrix[2],
Matrix[3], MaskLow);
352 TransposedMatrix[0] =
353 Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
354 TransposedMatrix[1] =
355 Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
358void X86InterleavedAccessGroup::interleave8bitStride4(
370 TransposedMatrix.
resize(4);
397 IntrVec[0] = Builder.CreateShuffleVector(
Matrix[0],
Matrix[1], MaskLow);
398 IntrVec[1] = Builder.CreateShuffleVector(
Matrix[0],
Matrix[1], MaskHigh);
399 IntrVec[2] = Builder.CreateShuffleVector(
Matrix[2],
Matrix[3], MaskLow);
400 IntrVec[3] = Builder.CreateShuffleVector(
Matrix[2],
Matrix[3], MaskHigh);
408 for (
int i = 0; i < 4; i++)
409 VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
417 if (VT == MVT::v16i8) {
418 std::copy(VecOut, VecOut + 4, TransposedMatrix.
begin());
440 int LaneCount = std::max(VectorSize / 128, 1);
441 for (
int Lane = 0; Lane < LaneCount; Lane++)
442 for (
int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
443 Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
453 for (
int i = 0, FirstGroupElement = 0; i < 3; i++) {
454 int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
456 FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
475 bool AlignDirection =
true,
bool Unary =
false) {
477 unsigned NumLanes = std::max((
int)VT.
getSizeInBits() / 128, 1);
478 unsigned NumLaneElts = NumElts / NumLanes;
480 Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
483 for (
unsigned l = 0; l != NumElts; l += NumLaneElts) {
484 for (
unsigned i = 0; i != NumLaneElts; ++i) {
488 if (
Base >= NumLaneElts)
489 Base = Unary ?
Base % NumLaneElts :
Base + NumElts - NumLaneElts;
524 if (VecElems == 16) {
525 for (
int i = 0; i < 3; i++)
530 for (
unsigned j = 0; j < VecElems / 32; j++)
531 for (
int i = 0; i < 3; i++)
538 for (
int i = 0; i < 3; i++)
542void X86InterleavedAccessGroup::deinterleave8bitStride3(
550 TransposedMatrix.
resize(3);
556 Value *Vec[6], *TempVector[3];
563 for (
int i = 0; i < 2; i++)
574 for (
int i = 0; i < 3; i++)
575 Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
581 for (
int i = 0; i < 3; i++)
583 Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);
589 for (
int i = 0; i < 3; i++)
590 Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
597 Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3);
598 TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2);
599 TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
600 TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
608 int IndexGroup[3] = {0, 0, 0};
613 int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
614 for (
int i = 0; i < 3; i++) {
615 IndexGroup[(
Index * 3) % (VF / Lane)] =
Index;
619 for (
int i = 0; i < VF / Lane; i++) {
625void X86InterleavedAccessGroup::interleave8bitStride3(
633 TransposedMatrix.
resize(3);
640 Value *Vec[3], *TempVector[3];
645 for (
int i = 0; i < 3; i++)
655 Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2);
656 Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3);
663 for (
int i = 0; i < 3; i++)
665 Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);
671 for (
int i = 0; i < 3; i++)
672 Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
684void X86InterleavedAccessGroup::transpose_4x4(
688 TransposedMatrix.
resize(4);
691 static constexpr int IntMask1[] = {0, 1, 4, 5};
697 static constexpr int IntMask2[] = {2, 3, 6, 7};
703 static constexpr int IntMask3[] = {0, 4, 2, 6};
705 TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
706 TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
709 static constexpr int IntMask4[] = {1, 5, 3, 7};
711 TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
712 TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
717bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
720 auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->
getType());
722 if (isa<LoadInst>(Inst)) {
723 auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
724 unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
725 switch (NumSubVecElems) {
733 if (ShuffleTy->getNumElements() != NumSubVecElems)
739 decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
745 if (NumSubVecElems == 4)
746 transpose_4x4(DecomposedVectors, TransposedVectors);
748 deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
753 for (
unsigned i = 0, e = Shuffles.size(); i < e; ++i)
754 Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);
759 Type *ShuffleEltTy = ShuffleTy->getElementType();
760 unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;
771 switch (NumSubVecElems) {
773 transpose_4x4(DecomposedVectors, TransposedVectors);
776 interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
782 interleave8bitStride4(DecomposedVectors, TransposedVectors,
785 interleave8bitStride3(DecomposedVectors, TransposedVectors,
797 Builder.CreateAlignedStore(WideVec,
SI->getPointerOperand(),
SI->getAlign());
810 "Invalid interleave factor");
811 assert(!Shuffles.
empty() &&
"Empty shufflevector input");
813 "Unmatched number of shufflevectors and indices");
817 X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
820 return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
825 unsigned Factor)
const {
827 "Invalid interleave factor");
829 assert(cast<FixedVectorType>(SVI->
getType())->getNumElements() % Factor ==
831 "Invalid interleaved store");
837 for (
unsigned i = 0; i < Factor; i++)
844 X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
847 return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static Decomposition decompose(Value *V, SmallVectorImpl< ConditionTy > &Preconditions, bool IsSigned, const DataLayout &DL)
Module.h This file contains the declarations for the Module class.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
static void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl< int > &ShuffleMask, bool AlignDirection=true, bool Unary=false)
static void genShuffleBland(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &Out, int LowOffset, int HighOffset)
static MVT scaleVectorType(MVT VT)
static constexpr int Concat[]
static void group2Shuffle(MVT VT, SmallVectorImpl< int > &Mask, SmallVectorImpl< int > &Output)
static void createShuffleStride(MVT VT, int Stride, SmallVectorImpl< int > &Mask)
static void concatSubVector(Value **Vec, ArrayRef< Instruction * > InVec, unsigned VecElems, IRBuilder<> &Builder)
static void setGroupSize(MVT VT, SmallVectorImpl< int > &SizeInfo)
static void reorderSubVector(MVT VT, SmallVectorImpl< Value * > &TransposedMatrix, ArrayRef< Value * > Vec, ArrayRef< int > VPShuf, unsigned VecElems, unsigned Stride, IRBuilder<> &Builder)
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
A parsed version of the target data layout string, and methods for querying it.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
The instances of the Type class are immutable: once they are created, they are never changed.
bool isVectorTy() const
True if this is an instance of VectorType.
static IntegerType * getInt8Ty(LLVMContext &C)
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower interleaved load(s) into target specific instructions/intrinsics.
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower interleaved store(s) into target specific instructions/intrinsics.
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of...
constexpr ScalarTy getFixedValue() const
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
This is an optimization pass for GlobalISel generic memory operations.
unsigned getPointerAddressSpace(const Type *T)
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
This struct is a compact representation of a valid (non-zero power of two) alignment.