//===- X86InterleavedAccess.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains the X86 implementation of the interleaved accesses
/// optimization, generating X86-specific instructions/intrinsics for
/// interleaved access groups.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include <algorithm>
#include <cassert>
#include <cmath>

using namespace llvm;

namespace {

/// This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
/// E.g. A group of interleaving access loads (Factor = 2; accessing every
/// other element):
///   %wide.vec = load <8 x i32>, <8 x i32>* %ptr
///   %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
///   %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
class X86InterleavedAccessGroup {
  /// Reference to the wide-load instruction of an interleaved access
  /// group.
  Instruction *const Inst;

  /// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
  ArrayRef<ShuffleVectorInst *> Shuffles;

  /// Reference to the starting index of each user-shuffle.
  ArrayRef<unsigned> Indices;

  /// Reference to the interleaving stride in terms of elements.
  const unsigned Factor;

  /// Reference to the underlying target.
  const X86Subtarget &Subtarget;

  const DataLayout &DL;

  IRBuilder<> &Builder;

  /// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
  /// sub-vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
  void decompose(Instruction *Inst, unsigned NumSubVectors, FixedVectorType *T,
                 SmallVectorImpl<Instruction *> &DecomposedVectors);

  /// Performs matrix transposition on a 4x4 matrix \p InputVectors and
  /// returns the transposed-vectors in \p TransposedVectors.
  /// E.g.
  /// InputVectors:
  ///   In-V0 = p1, p2, p3, p4
  ///   In-V1 = q1, q2, q3, q4
  ///   In-V2 = r1, r2, r3, r4
  ///   In-V3 = s1, s2, s3, s4
  /// OutputVectors:
  ///   Out-V0 = p1, q1, r1, s1
  ///   Out-V1 = p2, q2, r2, s2
  ///   Out-V2 = p3, q3, r3, s3
  ///   Out-V3 = p4, q4, r4, s4
  void transpose_4x4(ArrayRef<Instruction *> InputVectors,
                     SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
                                SmallVectorImpl<Value *> &TransposedMatrix);
  void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                             SmallVectorImpl<Value *> &TransposedMatrix,
                             unsigned NumSubVecElems);
  void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
                               SmallVectorImpl<Value *> &TransposedMatrix,
                               unsigned NumSubVecElems);

public:
  /// In order to form an interleaved access group, X86InterleavedAccessGroup
  /// requires a wide-load instruction \p 'I', a group of interleaved-vectors
  /// \p Shuffs, a reference to the first indices of each interleaved-vector
  /// \p 'Ind', and the interleaving stride factor \p F. In order to generate
  /// X86-specific instructions/intrinsics it also requires the underlying
  /// target information \p STarget.
  explicit X86InterleavedAccessGroup(Instruction *I,
                                     ArrayRef<ShuffleVectorInst *> Shuffs,
                                     ArrayRef<unsigned> Ind, const unsigned F,
                                     const X86Subtarget &STarget,
                                     IRBuilder<> &B)
      : Inst(I), Shuffles(Shuffs), Indices(Ind), Factor(F), Subtarget(STarget),
        DL(Inst->getDataLayout()), Builder(B) {}

  /// Returns true if this interleaved access group can be lowered into
  /// x86-specific instructions/intrinsics, false otherwise.
  bool isSupported() const;

  /// Lowers this interleaved access group into X86-specific
  /// instructions/intrinsics.
  bool lowerIntoOptimizedSequence();
};

} // end anonymous namespace

bool X86InterleavedAccessGroup::isSupported() const {
  VectorType *ShuffleVecTy = Shuffles[0]->getType();
  Type *ShuffleEltTy = ShuffleVecTy->getElementType();
  unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
  unsigned WideInstSize;

  // Currently, lowering is supported for the following vectors:
  // Stride 4:
  //   1. Store and load of 4-element vectors of 64 bits on AVX.
  //   2. Store of 16/32-element vectors of 8 bits on AVX.
  // Stride 3:
  //   1. Load of 16/32-element vectors of 8 bits on AVX.
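  // Concrete instances that pass the checks below (worked out here for
  // illustration; not part of the upstream comment):
  //   - Factor 4, i64: a wide type of <16 x i64>
  //     (ShuffleElemSize == 64, WideInstSize == 1024).
  //   - Factor 4, i8 store: wide types <32 x i8>, <64 x i8>, <128 x i8> or
  //     <256 x i8> (WideInstSize 256/512/1024/2048).
  //   - Factor 3, i8 load: wide types <48 x i8>, <96 x i8> or <192 x i8>
  //     (WideInstSize 384/768/1536).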
  if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
    return false;

  if (isa<LoadInst>(Inst)) {
    WideInstSize = DL.getTypeSizeInBits(Inst->getType());
    if (cast<LoadInst>(Inst)->getPointerAddressSpace())
      return false;
  } else
    WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());

  // We support a shuffle that represents stride 4 for a 64-bit element type
  // with a wide instruction of 1024 bits.
  if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
    return true;

  if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
      (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
       WideInstSize == 2048))
    return true;

  if (ShuffleElemSize == 8 && Factor == 3 &&
      (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
    return true;

  return false;
}

void X86InterleavedAccessGroup::decompose(
    Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
    SmallVectorImpl<Instruction *> &DecomposedVectors) {
  assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
         "Expected Load or Shuffle");

  Type *VecWidth = VecInst->getType();
  (void)VecWidth;
  assert(VecWidth->isVectorTy() &&
         DL.getTypeSizeInBits(VecWidth) >=
             DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
         "Invalid Inst-size!!!");

  if (auto *SVI = dyn_cast<ShuffleVectorInst>(VecInst)) {
    Value *Op0 = SVI->getOperand(0);
    Value *Op1 = SVI->getOperand(1);

    // Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
    for (unsigned i = 0; i < NumSubVectors; ++i)
      DecomposedVectors.push_back(
          cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
              Op0, Op1,
              createSequentialMask(Indices[i], SubVecTy->getNumElements(),
                                   0))));
    return;
  }

  // Decompose the load instruction.
  LoadInst *LI = cast<LoadInst>(VecInst);
  Type *VecBaseTy;
  unsigned int NumLoads = NumSubVectors;
  // In the case of stride 3 with a vector of 32 elements, the information is
  // loaded in the following way:
  // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
  unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
  Value *VecBasePtr = LI->getPointerOperand();
  if (VecLength == 768 || VecLength == 1536) {
    VecBaseTy = FixedVectorType::get(Type::getInt8Ty(LI->getContext()), 16);
    NumLoads = NumSubVectors * (VecLength / 384);
  } else {
    VecBaseTy = SubVecTy;
  }
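  // For example (an illustration derived from the branch above): a stride-3
  // load of <96 x i8> has VecLength == 768, so it is split into
  // NumLoads = 3 * (768 / 384) = 6 loads of <16 x i8>, one 128-bit lane each.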
  // Generate N loads of T type.
  assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(8) &&
         "VecBaseTy's size must be a multiple of 8");
  const Align FirstAlignment = LI->getAlign();
  const Align SubsequentAlignment = commonAlignment(
      FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedValue() / 8);
  Align Alignment = FirstAlignment;
  for (unsigned i = 0; i < NumLoads; i++) {
    // TODO: Support inbounds GEP.
    Value *NewBasePtr =
        Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
    Instruction *NewLoad =
        Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
    DecomposedVectors.push_back(NewLoad);
    Alignment = SubsequentAlignment;
  }
}

// Changes the scale of the vector type: halves the number of elements and
// doubles the scalar size.
static MVT scaleVectorType(MVT VT) {
  unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
  return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
                          VT.getVectorNumElements() / 2);
}

static constexpr int Concat[] = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};

// genShuffleBland - Creates a shuffle mask from two source vectors. This
// function only works on instructions whose lanes fit inside 256-bit
// registers. From the mask 'Mask' it creates a new mask 'Out' by adding an
// offset to each index. The offset amount depends on two integers,
// 'LowOffset' and 'HighOffset', where 'LowOffset' applies to indices taken
// from the first vector and 'HighOffset' to indices taken from the second
// vector.
// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
// For the sequence to work as a mirror of the load, we must consider the
// element order as shown above.
// This function combines two kinds of shuffles: a per-lane shuffle
// (vpshufb-like) and a "blend" shuffle. By computing the shuffle on a
// sequence of 16 elements (one lane) and adding the correct offset, we
// create a shuffle + blend sequence between two shuffles.
static void genShuffleBland(MVT VT, ArrayRef<int> Mask,
                            SmallVectorImpl<int> &Out, int LowOffset,
                            int HighOffset) {
  assert(VT.getSizeInBits() >= 256 &&
         "This function doesn't accept width smaller than 256");
  unsigned NumOfElm = VT.getVectorNumElements();
  for (int I : Mask)
    Out.push_back(I + LowOffset);
  for (int I : Mask)
    Out.push_back(I + HighOffset + NumOfElm);
}

// reorderSubVector returns the data to its original state; it is, in effect,
// the inverse of concatSubVector.

// For VecElems = 16
// Invec[0] - |0|      TransposedMatrix[0] - |0|
// Invec[1] - |1|  =>  TransposedMatrix[1] - |1|
// Invec[2] - |2|      TransposedMatrix[2] - |2|

// For VecElems = 32
// Invec[0] - |0|3|      TransposedMatrix[0] - |0|1|
// Invec[1] - |1|4|  =>  TransposedMatrix[1] - |2|3|
// Invec[2] - |2|5|      TransposedMatrix[2] - |4|5|

// For VecElems = 64
// Invec[0] - |0|3|6|9 |      TransposedMatrix[0] - |0|1|2 |3 |
// Invec[1] - |1|4|7|10|  =>  TransposedMatrix[1] - |4|5|6 |7 |
// Invec[2] - |2|5|8|11|      TransposedMatrix[2] - |8|9|10|11|

static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
                             ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
                             unsigned VecElems, unsigned Stride,
                             IRBuilder<> &Builder) {

  if (VecElems == 16) {
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);
    return;
  }

  SmallVector<int, 32> OptimizeShuf;
  Value *Temp[8];

  for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
    genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
                    (i + 1) / Stride * 16);
    Temp[i / 2] = Builder.CreateShuffleVector(
        Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
    OptimizeShuf.clear();
  }

  if (VecElems == 32) {
    std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
    return;
  } else
    for (unsigned i = 0; i < Stride; i++)
      TransposedMatrix[i] =
          Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}

void X86InterleavedAccessGroup::interleave8bitStride4VF8(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  // Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c7
  // Matrix[1]= m0 m1 m2 m3 m4 ... m7
  // Matrix[2]= y0 y1 y2 y3 y4 ... y7
  // Matrix[3]= k0 k1 k2 k3 k4 ... k7

  MVT VT = MVT::v8i16;
  TransposedMatrix.resize(2);
  SmallVector<int, 16> MaskLow;
  SmallVector<int, 32> MaskLowTemp1, MaskLowWord;
  SmallVector<int, 32> MaskHighTemp1, MaskHighWord;

  for (unsigned i = 0; i < 8; ++i) {
    MaskLow.push_back(i);
    MaskLow.push_back(i + 8);
  }
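  // The loop above builds MaskLow = <0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13,
  // 6, 14, 7, 15>, i.e. a full byte-interleave of the two 8-element sources
  // (noted here for illustration).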

  createUnpackShuffleMask(VT, MaskLowTemp1, true, false);
  createUnpackShuffleMask(VT, MaskHighTemp1, false, false);
  narrowShuffleMaskElts(2, MaskHighTemp1, MaskHighWord);
  narrowShuffleMaskElts(2, MaskLowTemp1, MaskLowWord);
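  // narrowShuffleMaskElts(2, ...) rewrites each i16 mask index m as the byte
  // pair {2m, 2m+1}; e.g. a mask fragment {0, 8} becomes {0, 1, 16, 17}
  // (a worked illustration, not part of the upstream comment).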
  // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
  // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
  Value *IntrVec1Low =
      Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  Value *IntrVec2Low =
      Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);

  // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
  // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7

  TransposedMatrix[0] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
  TransposedMatrix[1] =
      Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
}

void X86InterleavedAccessGroup::interleave8bitStride4(
    ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned NumOfElm) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= c0 c1 c2 c3 c4 ... c31
  // Matrix[1]= m0 m1 m2 m3 m4 ... m31
  // Matrix[2]= y0 y1 y2 y3 y4 ... y31
  // Matrix[3]= k0 k1 k2 k3 k4 ... k31

  MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
  MVT HalfVT = scaleVectorType(VT);

  TransposedMatrix.resize(4);
  SmallVector<int, 32> MaskHigh;
  SmallVector<int, 32> MaskLow;
  SmallVector<int, 32> LowHighMask[2];
  SmallVector<int, 32> MaskHighTemp;
  SmallVector<int, 32> MaskLowTemp;

  // MaskLow and MaskHigh follow the vpunpcklbw and vpunpckhbw X86 shuffle
  // patterns.

  createUnpackShuffleMask(VT, MaskLow, true, false);
  createUnpackShuffleMask(VT, MaskHigh, false, false);

  // LowHighMask[0] and LowHighMask[1] follow the vpunpckldw and vpunpckhdw
  // X86 shuffle patterns, narrowed back to byte indices.

  createUnpackShuffleMask(HalfVT, MaskLowTemp, true, false);
  createUnpackShuffleMask(HalfVT, MaskHighTemp, false, false);
  narrowShuffleMaskElts(2, MaskLowTemp, LowHighMask[0]);
  narrowShuffleMaskElts(2, MaskHighTemp, LowHighMask[1]);

  // IntrVec1Low  = c0 m0 c1 m1 ... c7 m7   | c16 m16 c17 m17 ... c23 m23
  // IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
  // IntrVec2Low  = y0 k0 y1 k1 ... y7 k7   | y16 k16 y17 k17 ... y23 k23
  // IntrVec2High = y8 k8 y9 k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
  Value *IntrVec[4];

  IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
  IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
  IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
  IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);

  // cmyk4  cmyk5  cmyk6  cmyk7  | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk12 cmyk13 cmyk14 cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
  // cmyk0  cmyk1  cmyk2  cmyk3  | cmyk16 cmyk17 cmyk18 cmyk19
  // cmyk8  cmyk9  cmyk10 cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27

  Value *VecOut[4];
  for (int i = 0; i < 4; i++)
    VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
                                            LowHighMask[i % 2]);

  // cmyk0  cmyk1  cmyk2  cmyk3  | cmyk4  cmyk5  cmyk6  cmyk7
  // cmyk8  cmyk9  cmyk10 cmyk11 | cmyk12 cmyk13 cmyk14 cmyk15
  // cmyk16 cmyk17 cmyk18 cmyk19 | cmyk20 cmyk21 cmyk22 cmyk23
  // cmyk24 cmyk25 cmyk26 cmyk27 | cmyk28 cmyk29 cmyk30 cmyk31

  if (VT == MVT::v16i8) {
    std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
    return;
  }

  reorderSubVector(VT, TransposedMatrix, VecOut, ArrayRef(Concat, 16), NumOfElm,
                   4, Builder);
}

// createShuffleStride returns a shuffle mask of size N.
// The shuffle pattern is as follows:
// {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
//  (VF/Lane), (VF/Lane)+Stride%(VF/Lane),...,
//  (VF/Lane)+(VF*Stride/Lane)%(VF/Lane)}
// Where Lane is the # of lanes in a register:
// VectorSize = 128 => Lane = 1
// VectorSize = 256 => Lane = 2
// For example, the shuffle pattern for VF 16 with register size 256
// (-> lanes = 2) is:
// {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
static void createShuffleStride(MVT VT, int Stride,
                                SmallVectorImpl<int> &Mask) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  int LaneCount = std::max(VectorSize / 128, 1);
  for (int Lane = 0; Lane < LaneCount; Lane++)
    for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
}

// setGroupSize sets 'SizeInfo' to the size (number of elements) of each
// group inside a shuffle mask. A mask contains exactly 3 groups, where
// each group is a monotonically increasing sequence with stride 3.
// For example, shuffle mask {0,3,6,1,4,7,2,5} => {3,3,2}.
static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) {
  int VectorSize = VT.getSizeInBits();
  int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
  for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
    int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
    SizeInfo.push_back(GroupSize);
    FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
  }
}

// DecodePALIGNRMask returns the shuffle mask of the vpalignr instruction.
// vpalignr operates on lanes, where Lane is the # of lanes in a register:
// VectorWide = 128 => Lane = 1
// VectorWide = 256 => Lane = 2
// For Lane = 1 the shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
// For Lane = 2 the shuffle pattern is:
// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
// The Imm variable sets the offset amount. The result of the function is
// stored in the ShuffleMask vector, built as described above.
// AlignDirection is a boolean that indicates the direction of the alignment
// (false - align to the "right" side while true - align to the "left" side).
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
                              SmallVectorImpl<int> &ShuffleMask,
                              bool AlignDirection = true, bool Unary = false) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
  unsigned NumLaneElts = NumElts / NumLanes;

  Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
  unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);

  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
    for (unsigned i = 0; i != NumLaneElts; ++i) {
      unsigned Base = i + Offset;
      // If i+Offset is out of this lane then we actually need the other
      // source. If Unary, the other source is the first source.
      if (Base >= NumLaneElts)
        Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
      ShuffleMask.push_back(Base + l);
    }
  }
}

// concatSubVector - The function rebuilds the data to the correct expected
// order. An assumption about the shape of the matrix is made so that the
// deinterleave can work with per-lane instructions such as 'vpalignr' or
// 'vpshufb'. This function ensures that the data is built in the correct
// way for those lane instructions. Each lane inside the vector is 128 bits
// long.
//
// The 'InVec' argument contains the data in increasing order. In InVec[0]
// you can find the first 128 bits of data. The number of different lanes
// inside a vector depends on 'VecElems'; in general, the formula is
// VecElems * type / 128. The size of the array 'InVec' is equal to
// 'VecElems'.

// For VecElems = 16
// Invec[0] - |0|      Vec[0] - |0|
// Invec[1] - |1|  =>  Vec[1] - |1|
// Invec[2] - |2|      Vec[2] - |2|

// For VecElems = 32
// Invec[0] - |0|1|      Vec[0] - |0|3|
// Invec[1] - |2|3|  =>  Vec[1] - |1|4|
// Invec[2] - |4|5|      Vec[2] - |2|5|

// For VecElems = 64
// Invec[0] - |0|1|2 |3 |      Vec[0] - |0|3|6|9 |
// Invec[1] - |4|5|6 |7 |  =>  Vec[1] - |1|4|7|10|
// Invec[2] - |8|9|10|11|      Vec[2] - |2|5|8|11|

static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
                            unsigned VecElems, IRBuilder<> &Builder) {
  if (VecElems == 16) {
    for (int i = 0; i < 3; i++)
      Vec[i] = InVec[i];
    return;
  }

  for (unsigned j = 0; j < VecElems / 32; j++)
    for (int i = 0; i < 3; i++)
      Vec[i + j * 3] = Builder.CreateShuffleVector(
          InVec[j * 6 + i], InVec[j * 6 + i + 3], ArrayRef(Concat, 32));

  if (VecElems == 32)
    return;

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
}

void X86InterleavedAccessGroup::deinterleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
  // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
  // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7

  TransposedMatrix.resize(3);
  SmallVector<int, 32> VPShuf;
  SmallVector<int, 32> VPAlign[2];
  SmallVector<int, 32> VPAlign2;
  SmallVector<int, 32> VPAlign3;
  SmallVector<int, 3> GroupSize;
  Value *Vec[6], *TempVector[3];

  MVT VT = MVT::getVT(Shuffles[0]->getType());

  createShuffleStride(VT, 3, VPShuf);
  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 2; i++)
    DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);

  DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);

  concatSubVector(Vec, InVec, VecElems, Builder);
  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(Vec[i], VPShuf);

  // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
                                         VPAlign[1]);

  // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  Value *TempVec = Builder.CreateShuffleVector(Vec[1], VPAlign3);
  TransposedMatrix[0] = Builder.CreateShuffleVector(Vec[0], VPAlign2);
  TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
  TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
}

// group2Shuffle reorders the shuffle stride back into continuous order.
// For example, for VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13}
// => MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
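// A smaller worked example (added here for illustration; not from the
// upstream comment): for VF = 8 with GroupSize = {3,3,2}, the loops below
// compute IndexGroup = {0,3,6} and emit the mask {0,3,6,1,4,7,2,5}.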
static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<int> &Output) {
  int IndexGroup[3] = {0, 0, 0};
  int Index = 0;
  int VectorWidth = VT.getSizeInBits();
  int VF = VT.getVectorNumElements();
  // Find the index of the different groups.
  int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
  for (int i = 0; i < 3; i++) {
    IndexGroup[(Index * 3) % (VF / Lane)] = Index;
    Index += Mask[i];
  }
  // According to the index, compute the convert mask.
  for (int i = 0; i < VF / Lane; i++) {
    Output.push_back(IndexGroup[i % 3]);
    IndexGroup[i % 3]++;
  }
}

void X86InterleavedAccessGroup::interleave8bitStride3(
    ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
    unsigned VecElems) {
  // Example: Assuming we start from the following vectors:
  // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
  // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
  // Matrix[2]= c0 c1 c2 c3 c4 c5 c6 c7

  TransposedMatrix.resize(3);
  SmallVector<int, 3> GroupSize;
  SmallVector<int, 32> VPShuf;
  SmallVector<int, 32> VPAlign[3];
  SmallVector<int, 32> VPAlign2;
  SmallVector<int, 32> VPAlign3;

  Value *Vec[3], *TempVector[3];
  MVT VT = MVT::getVectorVT(MVT::i8, VecElems);

  setGroupSize(VT, GroupSize);

  for (int i = 0; i < 3; i++)
    DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);

  DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
  DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);

  // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
  // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
  // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7

  Vec[0] = Builder.CreateShuffleVector(InVec[0], VPAlign2);
  Vec[1] = Builder.CreateShuffleVector(InVec[1], VPAlign3);
  Vec[2] = InVec[2];

  // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
  // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
  // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7

  for (int i = 0; i < 3; i++)
    TempVector[i] =
        Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);

  // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
  // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
  // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7

  for (int i = 0; i < 3; i++)
    Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
                                         VPAlign[2]);

  // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
  // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
  // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7

  unsigned NumOfElm = VT.getVectorNumElements();
  group2Shuffle(VT, GroupSize, VPShuf);
  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
}

void X86InterleavedAccessGroup::transpose_4x4(
    ArrayRef<Instruction *> Matrix,
    SmallVectorImpl<Value *> &TransposedMatrix) {
  assert(Matrix.size() == 4 && "Invalid matrix size");
  TransposedMatrix.resize(4);

  // dst = src1[0,1],src2[0,1]
  static constexpr int IntMask1[] = {0, 1, 4, 5};
  ArrayRef<int> Mask = ArrayRef(IntMask1, 4);
  Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[2,3],src2[2,3]
  static constexpr int IntMask2[] = {2, 3, 6, 7};
  Mask = ArrayRef(IntMask2, 4);
  Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
  Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);

  // dst = src1[0],src2[0],src1[2],src2[2]
  static constexpr int IntMask3[] = {0, 4, 2, 6};
  Mask = ArrayRef(IntMask3, 4);
  TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);

  // dst = src1[1],src2[1],src1[3],src2[3]
  static constexpr int IntMask4[] = {1, 5, 3, 7};
  Mask = ArrayRef(IntMask4, 4);
  TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
  TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
}

// Lowers this interleaved access group into X86-specific
// instructions/intrinsics.
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
  SmallVector<Instruction *, 4> DecomposedVectors;
  SmallVector<Value *, 4> TransposedVectors;
  auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());

  if (isa<LoadInst>(Inst)) {
    auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
    unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
    switch (NumSubVecElems) {
    default:
      return false;
    case 4:
    case 8:
    case 16:
    case 32:
    case 64:
      if (ShuffleTy->getNumElements() != NumSubVecElems)
        return false;
      break;
    }

    // Try to generate target-sized registers(/instructions).
    decompose(Inst, Factor, ShuffleTy, DecomposedVectors);

    // Perform matrix-transposition in order to compute interleaved
    // results by generating some sort of (optimized) target-specific
    // instructions.

    if (NumSubVecElems == 4)
      transpose_4x4(DecomposedVectors, TransposedVectors);
    else
      deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
                              NumSubVecElems);

    // Now replace the unoptimized-interleaved-vectors with the
    // transposed-interleaved vectors.
    for (unsigned i = 0, e = Shuffles.size(); i < e; ++i)
      Shuffles[i]->replaceAllUsesWith(TransposedVectors[Indices[i]]);

    return true;
  }

  Type *ShuffleEltTy = ShuffleTy->getElementType();
  unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;

  // Lower the interleaved stores:
  //   1. Decompose the interleaved wide shuffle into individual shuffle
  //      vectors.
  decompose(Shuffles[0], Factor,
            FixedVectorType::get(ShuffleEltTy, NumSubVecElems),
            DecomposedVectors);

  //   2. Transpose the interleaved-vectors into vectors of contiguous
  //      elements.
  switch (NumSubVecElems) {
  case 4:
    transpose_4x4(DecomposedVectors, TransposedVectors);
    break;
  case 8:
    interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
    break;
  case 16:
  case 32:
  case 64:
    if (Factor == 4)
      interleave8bitStride4(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    if (Factor == 3)
      interleave8bitStride3(DecomposedVectors, TransposedVectors,
                            NumSubVecElems);
    break;
  default:
    return false;
  }

  //   3. Concatenate the contiguous-vectors back into a wide vector.
  Value *WideVec = concatenateVectors(Builder, TransposedVectors);

  //   4. Generate a store instruction for the wide vector.
  StoreInst *SI = cast<StoreInst>(Inst);
  Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), SI->getAlign());

  return true;
}

// Lower interleaved load(s) into target specific instructions/intrinsics.
// The lowering sequence varies depending on the vector-types, factor,
// number of shuffles and ISA.
// Currently, lowering is supported for 4x64-bit vectors with Factor = 4 on
// AVX.
bool X86TargetLowering::lowerInterleavedLoad(
    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
    ArrayRef<unsigned> Indices, unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");
  assert(!Shuffles.empty() && "Empty shufflevector input");
  assert(Shuffles.size() == Indices.size() &&
         "Unmatched number of shufflevectors and indices");

  // Create an interleaved access group.
  IRBuilder<> Builder(LI);
  X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}
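
// Illustrative input for the hook above (a sketch with hypothetical values):
// a Factor = 4 group over a wide <16 x i64> load,
//   %wide.vec = load <16 x i64>, ptr %ptr
//   %v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <0, 4, 8, 12>
//   %v1 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <1, 5, 9, 13>
// and likewise for %v2 and %v3.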

bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                              ShuffleVectorInst *SVI,
                                              unsigned Factor) const {
  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
         "Invalid interleave factor");

  assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor ==
             0 &&
         "Invalid interleaved store");

  // Holds the indices of SVI that correspond to the starting index of each
  // interleaved shuffle.
  SmallVector<unsigned, 4> Indices;
  auto Mask = SVI->getShuffleMask();
  for (unsigned i = 0; i < Factor; i++)
    Indices.push_back(Mask[i]);

  ArrayRef<ShuffleVectorInst *> Shuffles = ArrayRef(SVI);

  // Create an interleaved access group.
  IRBuilder<> Builder(SI);
  X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
                                Builder);

  return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
}