49 class X86InterleavedAccessGroup {
61 const unsigned Factor;
92 unsigned NumSubVecElems);
97 unsigned NumSubVecElems);
100 unsigned NumSubVecElems);
114 : Inst(
I), Shuffles(Shuffs), Indices(Ind), Factor(
F), Subtarget(STarget),
115 DL(Inst->getModule()->getDataLayout()),
Builder(
B) {}
119 bool isSupported()
const;
123 bool lowerIntoOptimizedSequence();
128 bool X86InterleavedAccessGroup::isSupported()
const {
129 VectorType *ShuffleVecTy = Shuffles[0]->getType();
131 unsigned ShuffleElemSize =
DL.getTypeSizeInBits(ShuffleEltTy);
132 unsigned WideInstSize;
140 if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
143 if (isa<LoadInst>(Inst)) {
144 WideInstSize =
DL.getTypeSizeInBits(Inst->getType());
145 if (cast<LoadInst>(Inst)->getPointerAddressSpace())
148 WideInstSize =
DL.getTypeSizeInBits(Shuffles[0]->
getType());
152 if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
155 if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
156 (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
157 WideInstSize == 2048))
160 if (ShuffleElemSize == 8 && Factor == 3 &&
161 (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
170 assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
171 "Expected Load or Shuffle");
176 DL.getTypeSizeInBits(VecWidth) >=
177 DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
178 "Invalid Inst-size!!!");
180 if (
auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
181 Value *Op0 = SVI->getOperand(0);
182 Value *Op1 = SVI->getOperand(1);
185 for (
unsigned i = 0;
i < NumSubVectors; ++
i)
186 DecomposedVectors.push_back(
187 cast<ShuffleVectorInst>(
Builder.CreateShuffleVector(
195 LoadInst *LI = cast<LoadInst>(VecInst);
196 Type *VecBaseTy, *VecBasePtrTy;
198 unsigned int NumLoads = NumSubVectors;
202 unsigned VecLength =
DL.getTypeSizeInBits(VecWidth);
203 if (VecLength == 768 || VecLength == 1536) {
207 NumLoads = NumSubVectors * (VecLength / 384);
209 VecBaseTy = SubVecTy;
215 "VecBaseTy's size must be a multiple of 8");
220 for (
unsigned i = 0;
i < NumLoads;
i++) {
225 Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
226 DecomposedVectors.push_back(NewLoad);
240 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
241 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
242 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
243 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
265 "This function doesn't accept width smaller then 256");
267 for (
unsigned i = 0;
i <
Mask.size();
i++)
268 Out.push_back(
Mask[
i] + LowOffset);
269 for (
unsigned i = 0;
i <
Mask.size();
i++)
270 Out.push_back(
Mask[
i] + HighOffset + NumOfElm);
293 unsigned VecElems,
unsigned Stride,
296 if (VecElems == 16) {
297 for (
unsigned i = 0;
i < Stride;
i++)
298 TransposedMatrix[
i] =
Builder.CreateShuffleVector(Vec[
i], VPShuf);
305 for (
unsigned i = 0;
i < (VecElems / 16) * Stride;
i += 2) {
307 (
i + 1) / Stride * 16);
308 Temp[
i / 2] =
Builder.CreateShuffleVector(
309 Vec[
i % Stride], Vec[(
i + 1) % Stride], OptimizeShuf);
310 OptimizeShuf.
clear();
313 if (VecElems == 32) {
314 std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
317 for (
unsigned i = 0;
i < Stride;
i++)
318 TransposedMatrix[
i] =
322 void X86InterleavedAccessGroup::interleave8bitStride4VF8(
332 TransposedMatrix.
resize(2);
337 for (
unsigned i = 0;
i < 8; ++
i) {
338 MaskLow.push_back(
i);
339 MaskLow.push_back(
i + 8);
349 Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
351 Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
356 TransposedMatrix[0] =
357 Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
358 TransposedMatrix[1] =
359 Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
362 void X86InterleavedAccessGroup::interleave8bitStride4(
374 TransposedMatrix.
resize(4);
401 IntrVec[0] =
Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
402 IntrVec[1] =
Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
403 IntrVec[2] =
Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
404 IntrVec[3] =
Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);
412 for (
int i = 0;
i < 4;
i++)
413 VecOut[
i] =
Builder.CreateShuffleVector(IntrVec[
i / 2], IntrVec[
i / 2 + 2],
422 std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
444 int LaneCount =
std::max(VectorSize / 128, 1);
445 for (
int Lane = 0; Lane < LaneCount; Lane++)
446 for (
int i = 0, LaneSize = VF / LaneCount;
i != LaneSize; ++
i)
447 Mask.push_back((
i * Stride) % LaneSize + LaneSize * Lane);
457 for (
int i = 0, FirstGroupElement = 0;
i < 3;
i++) {
458 int GroupSize =
std::ceil((VF - FirstGroupElement) / 3.0);
459 SizeInfo.push_back(GroupSize);
460 FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
479 bool AlignDirection =
true,
bool Unary =
false) {
482 unsigned NumLaneElts = NumElts / NumLanes;
484 Imm = AlignDirection ?
Imm : (NumLaneElts -
Imm);
487 for (
unsigned l = 0;
l != NumElts;
l += NumLaneElts) {
488 for (
unsigned i = 0;
i != NumLaneElts; ++
i) {
489 unsigned Base =
i + Offset;
492 if (
Base >= NumLaneElts)
493 Base = Unary ?
Base % NumLaneElts :
Base + NumElts - NumLaneElts;
494 ShuffleMask.push_back(
Base +
l);
528 if (VecElems == 16) {
529 for (
int i = 0;
i < 3;
i++)
534 for (
unsigned j = 0;
j < VecElems / 32;
j++)
535 for (
int i = 0;
i < 3;
i++)
536 Vec[
i +
j * 3] =
Builder.CreateShuffleVector(
542 for (
int i = 0;
i < 3;
i++)
546 void X86InterleavedAccessGroup::deinterleave8bitStride3(
554 TransposedMatrix.
resize(3);
560 Value *Vec[6], *TempVector[3];
567 for (
int i = 0;
i < 2;
i++)
578 for (
int i = 0;
i < 3;
i++)
579 Vec[
i] =
Builder.CreateShuffleVector(Vec[
i], VPShuf);
585 for (
int i = 0;
i < 3;
i++)
587 Builder.CreateShuffleVector(Vec[(
i + 2) % 3], Vec[
i], VPAlign[0]);
593 for (
int i = 0;
i < 3;
i++)
594 Vec[
i] =
Builder.CreateShuffleVector(TempVector[(
i + 1) % 3], TempVector[
i],
601 Value *TempVec =
Builder.CreateShuffleVector(Vec[1], VPAlign3);
602 TransposedMatrix[0] =
Builder.CreateShuffleVector(Vec[0], VPAlign2);
603 TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
604 TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
612 int IndexGroup[3] = {0, 0, 0};
617 int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
618 for (
int i = 0;
i < 3;
i++) {
619 IndexGroup[(Index * 3) % (VF / Lane)] = Index;
623 for (
int i = 0;
i < VF / Lane;
i++) {
624 Output.push_back(IndexGroup[
i % 3]);
629 void X86InterleavedAccessGroup::interleave8bitStride3(
637 TransposedMatrix.
resize(3);
644 Value *Vec[3], *TempVector[3];
649 for (
int i = 0;
i < 3;
i++)
659 Vec[0] =
Builder.CreateShuffleVector(InVec[0], VPAlign2);
660 Vec[1] =
Builder.CreateShuffleVector(InVec[1], VPAlign3);
667 for (
int i = 0;
i < 3;
i++)
669 Builder.CreateShuffleVector(Vec[
i], Vec[(
i + 2) % 3], VPAlign[1]);
675 for (
int i = 0;
i < 3;
i++)
676 Vec[
i] =
Builder.CreateShuffleVector(TempVector[
i], TempVector[(
i + 1) % 3],
688 void X86InterleavedAccessGroup::transpose_4x4(
692 TransposedMatrix.
resize(4);
695 static constexpr
int IntMask1[] = {0, 1, 4, 5};
701 static constexpr
int IntMask2[] = {2, 3, 6, 7};
707 static constexpr
int IntMask3[] = {0, 4, 2, 6};
709 TransposedMatrix[0] =
Builder.CreateShuffleVector(IntrVec1, IntrVec2,
Mask);
710 TransposedMatrix[2] =
Builder.CreateShuffleVector(IntrVec3, IntrVec4,
Mask);
713 static constexpr
int IntMask4[] = {1, 5, 3, 7};
715 TransposedMatrix[1] =
Builder.CreateShuffleVector(IntrVec1, IntrVec2,
Mask);
716 TransposedMatrix[3] =
Builder.CreateShuffleVector(IntrVec3, IntrVec4,
Mask);
721 bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
724 auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->
getType());
726 if (isa<LoadInst>(Inst)) {
727 auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
728 unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
729 switch (NumSubVecElems) {
737 if (ShuffleTy->getNumElements() != NumSubVecElems)
743 decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
749 if (NumSubVecElems == 4)
750 transpose_4x4(DecomposedVectors, TransposedVectors);
752 deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
757 for (
unsigned i = 0,
e = Shuffles.size();
i <
e; ++
i)
758 Shuffles[
i]->replaceAllUsesWith(TransposedVectors[Indices[
i]]);
763 Type *ShuffleEltTy = ShuffleTy->getElementType();
764 unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;
775 switch (NumSubVecElems) {
777 transpose_4x4(DecomposedVectors, TransposedVectors);
780 interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
786 interleave8bitStride4(DecomposedVectors, TransposedVectors,
789 interleave8bitStride3(DecomposedVectors, TransposedVectors,
801 Builder.CreateAlignedStore(WideVec,
SI->getPointerOperand(),
SI->getAlign());
814 "Invalid interleave factor");
815 assert(!Shuffles.
empty() &&
"Empty shufflevector input");
817 "Unmatched number of shufflevectors and indices");
821 X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
824 return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
829 unsigned Factor)
const {
831 "Invalid interleave factor");
833 assert(cast<FixedVectorType>(SVI->
getType())->getNumElements() % Factor ==
835 "Invalid interleaved store");
841 for (
unsigned i = 0;
i < Factor;
i++)
842 Indices.push_back(
Mask[
i]);
848 X86InterleavedAccessGroup Grp(
SI, Shuffles, Indices, Factor, Subtarget,
851 return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();