49 class X86InterleavedAccessGroup {
61 const unsigned Factor;
92 unsigned NumSubVecElems);
97 unsigned NumSubVecElems);
100 unsigned NumSubVecElems);
114 : Inst(
I), Shuffles(Shuffs), Indices(Ind), Factor(
F), Subtarget(STarget),
115 DL(Inst->getModule()->getDataLayout()),
Builder(
B) {}
119 bool isSupported()
const;
123 bool lowerIntoOptimizedSequence();
128 bool X86InterleavedAccessGroup::isSupported()
const {
129 VectorType *ShuffleVecTy = Shuffles[0]->getType();
131 unsigned ShuffleElemSize =
DL.getTypeSizeInBits(ShuffleEltTy);
132 unsigned WideInstSize;
140 if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
143 if (isa<LoadInst>(Inst)) {
144 WideInstSize =
DL.getTypeSizeInBits(Inst->getType());
145 if (cast<LoadInst>(Inst)->getPointerAddressSpace())
148 WideInstSize =
DL.getTypeSizeInBits(Shuffles[0]->
getType());
152 if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
155 if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
156 (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
157 WideInstSize == 2048))
160 if (ShuffleElemSize == 8 && Factor == 3 &&
161 (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
170 assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
171 "Expected Load or Shuffle");
176 DL.getTypeSizeInBits(VecWidth) >=
177 DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
178 "Invalid Inst-size!!!");
180 if (
auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
181 Value *Op0 = SVI->getOperand(0);
182 Value *Op1 = SVI->getOperand(1);
185 for (
unsigned i = 0;
i < NumSubVectors; ++
i)
186 DecomposedVectors.push_back(
187 cast<ShuffleVectorInst>(
Builder.CreateShuffleVector(
195 LoadInst *LI = cast<LoadInst>(VecInst);
196 Type *VecBaseTy, *VecBasePtrTy;
198 unsigned int NumLoads = NumSubVectors;
202 unsigned VecLength =
DL.getTypeSizeInBits(VecWidth);
203 if (VecLength == 768 || VecLength == 1536) {
207 NumLoads = NumSubVectors * (VecLength / 384);
209 VecBaseTy = SubVecTy;
215 "VecBaseTy's size must be a multiple of 8");
219 Align Alignment = FirstAlignment;
220 for (
unsigned i = 0;
i < NumLoads;
i++) {
225 Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
226 DecomposedVectors.push_back(NewLoad);
227 Alignment = SubsequentAlignment;
240 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
241 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
242 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
243 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
265 "This function doesn't accept width smaller then 256");
267 for (
unsigned i = 0;
i <
Mask.size();
i++)
268 Out.push_back(
Mask[
i] + LowOffset);
269 for (
unsigned i = 0;
i <
Mask.size();
i++)
270 Out.push_back(
Mask[
i] + HighOffset + NumOfElm);
293 unsigned VecElems,
unsigned Stride,
296 if (VecElems == 16) {
297 for (
unsigned i = 0;
i < Stride;
i++)
298 TransposedMatrix[
i] =
Builder.CreateShuffleVector(Vec[
i], VPShuf);
305 for (
unsigned i = 0;
i < (VecElems / 16) * Stride;
i += 2) {
307 (
i + 1) / Stride * 16);
308 Temp[
i / 2] =
Builder.CreateShuffleVector(
309 Vec[
i % Stride], Vec[(
i + 1) % Stride], OptimizeShuf);
310 OptimizeShuf.
clear();
313 if (VecElems == 32) {
314 std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
317 for (
unsigned i = 0;
i < Stride;
i++)
318 TransposedMatrix[
i] =
322 void X86InterleavedAccessGroup::interleave8bitStride4VF8(
332 TransposedMatrix.
resize(2);
337 for (
unsigned i = 0;
i < 8; ++
i) {
338 MaskLow.push_back(
i);
339 MaskLow.push_back(
i + 8);
356 TransposedMatrix[0] =
357 Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
358 TransposedMatrix[1] =
359 Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
362 void X86InterleavedAccessGroup::interleave8bitStride4(
374 TransposedMatrix.
resize(4);
412 for (
int i = 0;
i < 4;
i++)
413 VecOut[
i] =
Builder.CreateShuffleVector(IntrVec[
i / 2], IntrVec[
i / 2 + 2],
422 std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
444 int LaneCount =
std::max(VectorSize / 128, 1);
445 for (
int Lane = 0; Lane < LaneCount; Lane++)
446 for (
int i = 0, LaneSize = VF / LaneCount;
i != LaneSize; ++
i)
447 Mask.push_back((
i * Stride) % LaneSize + LaneSize * Lane);
457 for (
int i = 0, FirstGroupElement = 0;
i < 3;
i++) {
458 int GroupSize =
std::ceil((VF - FirstGroupElement) / 3.0);
459 SizeInfo.push_back(GroupSize);
460 FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
479 bool AlignDirection =
true,
bool Unary =
false) {
482 unsigned NumLaneElts = NumElts / NumLanes;
484 Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
487 for (
unsigned l = 0;
l != NumElts;
l += NumLaneElts) {
488 for (
unsigned i = 0;
i != NumLaneElts; ++
i) {
492 if (Base >= NumLaneElts)
493 Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
494 ShuffleMask.push_back(Base +
l);
528 if (VecElems == 16) {
529 for (
int i = 0;
i < 3;
i++)
534 for (
unsigned j = 0;
j < VecElems / 32;
j++)
535 for (
int i = 0;
i < 3;
i++)
536 Vec[
i +
j * 3] =
Builder.CreateShuffleVector(
542 for (
int i = 0;
i < 3;
i++)
546 void X86InterleavedAccessGroup::deinterleave8bitStride3(
554 TransposedMatrix.
resize(3);
560 Value *Vec[6], *TempVector[3];
567 for (
int i = 0;
i < 2;
i++)
578 for (
int i = 0;
i < 3;
i++)
579 Vec[
i] =
Builder.CreateShuffleVector(Vec[
i], VPShuf);
585 for (
int i = 0;
i < 3;
i++)
587 Builder.CreateShuffleVector(Vec[(
i + 2) % 3], Vec[
i], VPAlign[0]);
593 for (
int i = 0;
i < 3;
i++)
594 Vec[
i] =
Builder.CreateShuffleVector(TempVector[(
i + 1) % 3], TempVector[
i],
601 Value *TempVec =
Builder.CreateShuffleVector(Vec[1], VPAlign3);
602 TransposedMatrix[0] =
Builder.CreateShuffleVector(Vec[0], VPAlign2);
603 TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
604 TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
612 int IndexGroup[3] = {0, 0, 0};
617 int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
618 for (
int i = 0;
i < 3;
i++) {
619 IndexGroup[(
Index * 3) % (VF / Lane)] =
Index;
623 for (
int i = 0;
i < VF / Lane;
i++) {
624 Output.push_back(IndexGroup[
i % 3]);
629 void X86InterleavedAccessGroup::interleave8bitStride3(
637 TransposedMatrix.
resize(3);
644 Value *Vec[3], *TempVector[3];
649 for (
int i = 0;
i < 3;
i++)
659 Vec[0] =
Builder.CreateShuffleVector(InVec[0], VPAlign2);
660 Vec[1] =
Builder.CreateShuffleVector(InVec[1], VPAlign3);
667 for (
int i = 0;
i < 3;
i++)
669 Builder.CreateShuffleVector(Vec[
i], Vec[(
i + 2) % 3], VPAlign[1]);
675 for (
int i = 0;
i < 3;
i++)
676 Vec[
i] =
Builder.CreateShuffleVector(TempVector[
i], TempVector[(
i + 1) % 3],
688 void X86InterleavedAccessGroup::transpose_4x4(
692 TransposedMatrix.
resize(4);
695 static constexpr
int IntMask1[] = {0, 1, 4, 5};
701 static constexpr
int IntMask2[] = {2, 3, 6, 7};
707 static constexpr
int IntMask3[] = {0, 4, 2, 6};
709 TransposedMatrix[0] =
Builder.CreateShuffleVector(IntrVec1, IntrVec2,
Mask);
710 TransposedMatrix[2] =
Builder.CreateShuffleVector(IntrVec3, IntrVec4,
Mask);
713 static constexpr
int IntMask4[] = {1, 5, 3, 7};
715 TransposedMatrix[1] =
Builder.CreateShuffleVector(IntrVec1, IntrVec2,
Mask);
716 TransposedMatrix[3] =
Builder.CreateShuffleVector(IntrVec3, IntrVec4,
Mask);
721 bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
724 auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->
getType());
726 if (isa<LoadInst>(Inst)) {
728 decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
730 auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
731 unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
736 switch (NumSubVecElems) {
740 transpose_4x4(DecomposedVectors, TransposedVectors);
746 deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
753 for (
unsigned i = 0,
e = Shuffles.size();
i <
e; ++
i)
754 Shuffles[
i]->replaceAllUsesWith(TransposedVectors[Indices[
i]]);
759 Type *ShuffleEltTy = ShuffleTy->getElementType();
760 unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;
771 switch (NumSubVecElems) {
773 transpose_4x4(DecomposedVectors, TransposedVectors);
776 interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
782 interleave8bitStride4(DecomposedVectors, TransposedVectors,
785 interleave8bitStride3(DecomposedVectors, TransposedVectors,
797 Builder.CreateAlignedStore(WideVec,
SI->getPointerOperand(),
SI->getAlign());
810 "Invalid interleave factor");
811 assert(!Shuffles.
empty() &&
"Empty shufflevector input");
813 "Unmatched number of shufflevectors and indices");
817 X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
820 return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
825 unsigned Factor)
const {
827 "Invalid interleave factor");
829 assert(cast<FixedVectorType>(SVI->
getType())->getNumElements() % Factor ==
831 "Invalid interleaved store");
837 for (
unsigned i = 0;
i < Factor;
i++)
838 Indices.push_back(
Mask[
i]);
844 X86InterleavedAccessGroup Grp(
SI, Shuffles, Indices, Factor, Subtarget,
847 return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();