Line data Source code
1 : //===----------- VectorUtils.cpp - Vectorizer utility functions -----------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // This file defines vectorizer utilities.
11 : //
12 : //===----------------------------------------------------------------------===//
13 :
14 : #include "llvm/Analysis/VectorUtils.h"
15 : #include "llvm/ADT/EquivalenceClasses.h"
16 : #include "llvm/Analysis/DemandedBits.h"
17 : #include "llvm/Analysis/LoopInfo.h"
18 : #include "llvm/Analysis/LoopIterator.h"
19 : #include "llvm/Analysis/ScalarEvolution.h"
20 : #include "llvm/Analysis/ScalarEvolutionExpressions.h"
21 : #include "llvm/Analysis/TargetTransformInfo.h"
22 : #include "llvm/Analysis/ValueTracking.h"
23 : #include "llvm/IR/Constants.h"
24 : #include "llvm/IR/GetElementPtrTypeIterator.h"
25 : #include "llvm/IR/IRBuilder.h"
26 : #include "llvm/IR/PatternMatch.h"
27 : #include "llvm/IR/Value.h"
28 :
29 : #define DEBUG_TYPE "vectorutils"
30 :
31 : using namespace llvm;
32 : using namespace llvm::PatternMatch;
33 :
34 : /// Maximum factor for an interleaved memory access.
35 : static cl::opt<unsigned> MaxInterleaveGroupFactor(
36 : "max-interleave-group-factor", cl::Hidden,
37 : cl::desc("Maximum factor for an interleaved access group (default = 8)"),
38 : cl::init(8));
39 :
40 : /// Identify if the intrinsic is trivially vectorizable.
41 : /// This method returns true if the intrinsic's argument types are all
42 : /// scalars for the scalar form of the intrinsic and all vectors for
43 : /// the vector form of the intrinsic.
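      : /// For example, llvm.sqrt is available both as llvm.sqrt.f32 (scalar) and
      : /// as llvm.sqrt.v4f32 (vector), so it is trivially vectorizable.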
44 14645 : bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
45 14645 : switch (ID) {
46 : case Intrinsic::sqrt:
47 : case Intrinsic::sin:
48 : case Intrinsic::cos:
49 : case Intrinsic::exp:
50 : case Intrinsic::exp2:
51 : case Intrinsic::log:
52 : case Intrinsic::log10:
53 : case Intrinsic::log2:
54 : case Intrinsic::fabs:
55 : case Intrinsic::minnum:
56 : case Intrinsic::maxnum:
57 : case Intrinsic::copysign:
58 : case Intrinsic::floor:
59 : case Intrinsic::ceil:
60 : case Intrinsic::trunc:
61 : case Intrinsic::rint:
62 : case Intrinsic::nearbyint:
63 : case Intrinsic::round:
64 : case Intrinsic::bswap:
65 : case Intrinsic::bitreverse:
66 : case Intrinsic::ctpop:
67 : case Intrinsic::pow:
68 : case Intrinsic::fma:
69 : case Intrinsic::fmuladd:
70 : case Intrinsic::ctlz:
71 : case Intrinsic::cttz:
72 : case Intrinsic::powi:
73 : case Intrinsic::canonicalize:
74 : return true;
75 4795 : default:
76 4795 : return false;
77 : }
78 : }
79 :
80 : /// Identifies if the intrinsic has a scalar operand. It checks for
81 : /// ctlz, cttz and powi special intrinsics whose argument is scalar.
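      : /// For example, the exponent operand (index 1) of llvm.powi stays a scalar
      : /// i32 even when the base operand is vectorized.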
82 9724 : bool llvm::hasVectorInstrinsicScalarOpd(Intrinsic::ID ID,
83 : unsigned ScalarOpdIdx) {
84 9724 : switch (ID) {
85 2161 : case Intrinsic::ctlz:
86 : case Intrinsic::cttz:
87 : case Intrinsic::powi:
88 2161 : return (ScalarOpdIdx == 1);
89 : default:
90 : return false;
91 : }
92 : }
93 :
94 : /// Returns intrinsic ID for call.
95 : /// For the input call instruction it finds the mapping intrinsic and returns
96 : /// its ID; if no mapping is found, it returns not_intrinsic.
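      : /// For example, a direct call to @llvm.sqrt.f64 yields Intrinsic::sqrt,
      : /// whereas a call to an unknown external function yields not_intrinsic.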
97 14331 : Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
98 : const TargetLibraryInfo *TLI) {
99 14331 : Intrinsic::ID ID = getIntrinsicForCallSite(CI, TLI);
100 14331 : if (ID == Intrinsic::not_intrinsic)
101 : return Intrinsic::not_intrinsic;
102 :
103 18369 : if (isTriviallyVectorizable(ID) || ID == Intrinsic::lifetime_start ||
104 13575 : ID == Intrinsic::lifetime_end || ID == Intrinsic::assume ||
105 4717 : ID == Intrinsic::sideeffect)
106 8890 : return ID;
107 : return Intrinsic::not_intrinsic;
108 : }
109 :
110 : /// Find the operand of the GEP that should be checked for consecutive
111 : /// stores. This ignores trailing indices that have no effect on the final
112 : /// pointer.
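      : /// For example, for "getelementptr inbounds i32, i32* %base, i64 %idx"
      : /// this returns 1, the operand index of %idx.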
113 4146 : unsigned llvm::getGEPInductionOperand(const GetElementPtrInst *Gep) {
114 4146 : const DataLayout &DL = Gep->getModule()->getDataLayout();
115 4146 : unsigned LastOperand = Gep->getNumOperands() - 1;
116 4146 : unsigned GEPAllocSize = DL.getTypeAllocSize(Gep->getResultElementType());
117 :
118 : // Walk backwards and try to peel off zeros.
119 5908 : while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
120 : // Find the type we're currently indexing into.
121 402 : gep_type_iterator GEPTI = gep_type_begin(Gep);
122 402 : std::advance(GEPTI, LastOperand - 2);
123 :
124 : // If it's a type with the same allocation size as the result of the GEP we
125 : // can peel off the zero index.
126 402 : if (DL.getTypeAllocSize(GEPTI.getIndexedType()) != GEPAllocSize)
127 : break;
128 173 : --LastOperand;
129 : }
130 :
131 4146 : return LastOperand;
132 : }
133 :
134 : /// If the argument is a GEP, then returns the operand identified by
135 : /// getGEPInductionOperand. However, if there is some other non-loop-invariant
136 : /// operand, it returns the original pointer instead.
137 9454 : Value *llvm::stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
138 : GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
139 : if (!GEP)
140 : return Ptr;
141 :
142 4146 : unsigned InductionOperand = getGEPInductionOperand(GEP);
143 :
144 : // Check that all of the gep indices are uniform except for our induction
145 : // operand.
146 12832 : for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
147 16022 : if (i != InductionOperand &&
148 6404 : !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
149 : return Ptr;
150 : return GEP->getOperand(InductionOperand);
151 : }
152 :
153 : /// If a value has only one CastInst user of the given type, return that cast.
154 1 : Value *llvm::getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
155 : Value *UniqueCast = nullptr;
156 3 : for (User *U : Ptr->users()) {
157 : CastInst *CI = dyn_cast<CastInst>(U);
158 1 : if (CI && CI->getType() == Ty) {
159 1 : if (!UniqueCast)
160 : UniqueCast = CI;
161 : else
162 : return nullptr;
163 : }
164 : }
165 : return UniqueCast;
166 : }
167 :
168 : /// Get the stride of a pointer access in a loop. Looks for symbolic
169 : /// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
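      : /// For example, for a load of A[i * Stride] with a loop-invariant Stride,
      : /// the stripped index typically has the SCEV {0,+,%Stride}, and %Stride is
      : /// the value returned.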
170 9454 : Value *llvm::getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
171 9454 : auto *PtrTy = dyn_cast<PointerType>(Ptr->getType());
172 : if (!PtrTy || PtrTy->isAggregateType())
173 : return nullptr;
174 :
175 : // Try to remove a gep instruction to make the pointer (actually index at this
176 : // point) easier to analyze. If OrigPtr is equal to Ptr we are analyzing the
177 : // pointer, otherwise, we are analyzing the index.
178 : Value *OrigPtr = Ptr;
179 :
180 : // The size of the pointer access.
181 : int64_t PtrAccessSize = 1;
182 :
183 9454 : Ptr = stripGetElementPtr(Ptr, SE, Lp);
184 9454 : const SCEV *V = SE->getSCEV(Ptr);
185 :
186 9454 : if (Ptr != OrigPtr)
187 : // Strip off casts.
188 : while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V))
189 103 : V = C->getOperand();
190 :
191 : const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
192 : if (!S)
193 : return nullptr;
194 :
195 4579 : V = S->getStepRecurrence(*SE);
196 4579 : if (!V)
197 : return nullptr;
198 :
199 : // Strip off the size of access multiplication if we are still analyzing the
200 : // pointer.
201 4579 : if (OrigPtr == Ptr) {
202 : if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
203 18 : if (M->getOperand(0)->getSCEVType() != scConstant)
204 : return nullptr;
205 :
206 : const APInt &APStepVal = cast<SCEVConstant>(M->getOperand(0))->getAPInt();
207 :
208 : // Huge step value - give up.
209 9 : if (APStepVal.getBitWidth() > 64)
210 : return nullptr;
211 :
212 : int64_t StepVal = APStepVal.getSExtValue();
213 9 : if (PtrAccessSize != StepVal)
214 : return nullptr;
215 : V = M->getOperand(1);
216 : }
217 : }
218 :
219 : // Strip off casts.
220 : Type *StripedOffRecurrenceCast = nullptr;
221 : if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) {
222 1 : StripedOffRecurrenceCast = C->getType();
223 1 : V = C->getOperand();
224 : }
225 :
226 : // Look for the loop invariant symbolic value.
227 : const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
228 15 : if (!U)
229 : return nullptr;
230 :
231 : Value *Stride = U->getValue();
232 15 : if (!Lp->isLoopInvariant(Stride))
233 : return nullptr;
234 :
235 : // If we have stripped off the recurrence cast we have to make sure that we
236 : // return the value that is used in this loop so that we can replace it later.
237 15 : if (StripedOffRecurrenceCast)
238 1 : Stride = getUniqueCastUse(Stride, Lp, StripedOffRecurrenceCast);
239 :
240 : return Stride;
241 : }
242 :
243 : /// Given a vector and an element number, see if the scalar value is
244 : /// already around as a register, for example if it were inserted then extracted
245 : /// from the vector.
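      : /// For example, if %v = insertelement <4 x i32> %w, i32 %x, i32 1, then
      : /// findScalarElement(%v, 1) returns %x directly, avoiding an extractelement.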
246 28517 : Value *llvm::findScalarElement(Value *V, unsigned EltNo) {
247 : assert(V->getType()->isVectorTy() && "Not looking at a vector?");
248 28517 : VectorType *VTy = cast<VectorType>(V->getType());
249 28517 : unsigned Width = VTy->getNumElements();
250 28517 : if (EltNo >= Width) // Out of range access.
251 0 : return UndefValue::get(VTy->getElementType());
252 :
253 : if (Constant *C = dyn_cast<Constant>(V))
254 18 : return C->getAggregateElement(EltNo);
255 :
256 : if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) {
257 : // If this is an insert to a variable element, we don't know what it is.
258 3597 : if (!isa<ConstantInt>(III->getOperand(2)))
259 : return nullptr;
260 459 : unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue();
261 :
262 : // If this is an insert to the element we are looking for, return the
263 : // inserted value.
264 459 : if (EltNo == IIElt)
265 : return III->getOperand(1);
266 :
267 : // Otherwise, the insertelement doesn't modify the value, recurse on its
268 : // vector input.
269 214 : return findScalarElement(III->getOperand(0), EltNo);
270 : }
271 :
272 : if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) {
273 646 : unsigned LHSWidth = SVI->getOperand(0)->getType()->getVectorNumElements();
274 : int InEl = SVI->getMaskValue(EltNo);
275 646 : if (InEl < 0)
276 0 : return UndefValue::get(VTy->getElementType());
277 646 : if (InEl < (int)LHSWidth)
278 403 : return findScalarElement(SVI->getOperand(0), InEl);
279 243 : return findScalarElement(SVI->getOperand(1), InEl - LHSWidth);
280 : }
281 :
282 : // Extract a value from a vector add operation with a constant zero.
283 : // TODO: Use getBinOpIdentity() to generalize this.
284 : Value *Val; Constant *C;
285 24256 : if (match(V, m_Add(m_Value(Val), m_Constant(C))))
286 493 : if (Constant *Elt = C->getAggregateElement(EltNo))
287 492 : if (Elt->isNullValue())
288 4 : return findScalarElement(Val, EltNo);
289 :
290 : // Otherwise, we don't know.
291 : return nullptr;
292 : }
293 :
294 : /// Get splat value if the input is a splat vector or return nullptr.
295 : /// This function is not fully general. It checks only 2 cases:
296 : /// the input value is (1) a splat constant vector or (2) a sequence
297 : /// of instructions that broadcast a single value into a vector.
298 : ///
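      : /// For example, for %i = insertelement <4 x i32> undef, i32 %x, i32 0
      : /// followed by %s = shufflevector <4 x i32> %i, <4 x i32> undef,
      : /// <4 x i32> zeroinitializer, getSplatValue(%s) returns %x.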
299 152745 : const llvm::Value *llvm::getSplatValue(const Value *V) {
300 :
301 : if (auto *C = dyn_cast<Constant>(V))
302 25692 : if (isa<VectorType>(V->getType()))
303 10812 : return C->getSplatValue();
304 :
305 : auto *ShuffleInst = dyn_cast<ShuffleVectorInst>(V);
306 : if (!ShuffleInst)
307 : return nullptr;
308 : // All-zero (or undef) shuffle mask elements.
309 12208 : for (int MaskElt : ShuffleInst->getShuffleMask())
310 11509 : if (MaskElt != 0 && MaskElt != -1)
311 : return nullptr;
312 : // The first shuffle source is 'insertelement' with index 0.
313 : auto *InsertEltInst =
314 : dyn_cast<InsertElementInst>(ShuffleInst->getOperand(0));
315 1288 : if (!InsertEltInst || !isa<ConstantInt>(InsertEltInst->getOperand(2)) ||
316 : !cast<ConstantInt>(InsertEltInst->getOperand(2))->isZero())
317 : return nullptr;
318 :
319 : return InsertEltInst->getOperand(1);
320 : }
321 :
322 : MapVector<Instruction *, uint64_t>
323 983 : llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
324 : const TargetTransformInfo *TTI) {
325 :
326 : // DemandedBits will give us every value's live-out bits. But we want
327 : // to ensure no extra casts would need to be inserted, so every DAG
328 : // of connected values must have the same minimum bitwidth.
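      : // Roughly, if an i32 chain is only ever truncated or compared such that
      : // just its low 8 bits are demanded, the whole connected chain becomes a
      : // candidate for a minimum bit width of 8 (rounded up to a power of two).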
329 : EquivalenceClasses<Value *> ECs;
330 : SmallVector<Value *, 16> Worklist;
331 : SmallPtrSet<Value *, 4> Roots;
332 : SmallPtrSet<Value *, 16> Visited;
333 : DenseMap<Value *, uint64_t> DBits;
334 : SmallPtrSet<Instruction *, 4> InstructionSet;
335 983 : MapVector<Instruction *, uint64_t> MinBWs;
336 :
337 : // Determine the roots. We work bottom-up, from truncs or icmps.
338 : bool SeenExtFromIllegalType = false;
339 2321 : for (auto *BB : Blocks)
340 13630 : for (auto &I : *BB) {
341 12292 : InstructionSet.insert(&I);
342 :
343 12480 : if (TTI && (isa<ZExtInst>(&I) || isa<SExtInst>(&I)) &&
344 376 : !TTI->isTypeLegal(I.getOperand(0)->getType()))
345 : SeenExtFromIllegalType = true;
346 :
347 : // Only deal with non-vector integers up to 64-bits wide.
348 12292 : if ((isa<TruncInst>(&I) || isa<ICmpInst>(&I)) &&
349 13466 : !I.getType()->isVectorTy() &&
350 3178 : I.getOperand(0)->getType()->getScalarSizeInBits() <= 64) {
351 : // Don't make work for ourselves. If we know the loaded type is legal,
352 : // don't add it to the worklist.
353 1584 : if (TTI && isa<TruncInst>(&I) && TTI->isTypeLegal(I.getType()))
354 : continue;
355 :
356 1465 : Worklist.push_back(&I);
357 1465 : Roots.insert(&I);
358 : }
359 : }
360 : // Early exit.
361 983 : if (Worklist.empty() || (TTI && !SeenExtFromIllegalType))
362 : return MinBWs;
363 :
364 : // Now proceed breadth-first, unioning values together.
365 811 : while (!Worklist.empty()) {
366 : Value *Val = Worklist.pop_back_val();
367 701 : Value *Leader = ECs.getOrInsertLeaderValue(Val);
368 :
369 701 : if (Visited.count(Val))
370 415 : continue;
371 633 : Visited.insert(Val);
372 :
373 : // Non-instructions terminate a chain successfully.
374 633 : if (!isa<Instruction>(Val))
375 : continue;
376 : Instruction *I = cast<Instruction>(Val);
377 :
378 : // If we encounter a type that is larger than 64 bits, we can't represent
379 : // it so bail out.
380 464 : if (DB.getDemandedBits(I).getBitWidth() > 64)
381 0 : return MapVector<Instruction *, uint64_t>();
382 :
383 928 : uint64_t V = DB.getDemandedBits(I).getZExtValue();
384 464 : DBits[Leader] |= V;
385 464 : DBits[I] = V;
386 :
387 : // Casts, loads and instructions outside of our range terminate a chain
388 : // successfully.
389 881 : if (isa<SExtInst>(I) || isa<ZExtInst>(I) || isa<LoadInst>(I) ||
390 417 : !InstructionSet.count(I))
391 54 : continue;
392 :
393 : // Unsafe casts terminate a chain unsuccessfully. We can't do anything
394 : // useful with bitcasts, ptrtoints or inttoptrs and it'd be unsafe to
395 : // transform anything that relies on them.
396 410 : if (isa<BitCastInst>(I) || isa<PtrToIntInst>(I) || isa<IntToPtrInst>(I) ||
397 410 : !I->getType()->isIntegerTy()) {
398 6 : DBits[Leader] |= ~0ULL;
399 6 : continue;
400 : }
401 :
402 : // We don't modify the types of PHIs. Reductions will already have been
403 : // truncated if possible, and inductions' sizes will have been chosen by
404 : // indvars.
405 404 : if (isa<PHINode>(I))
406 : continue;
407 :
408 340 : if (DBits[Leader] == ~0ULL)
409 : // All bits demanded, no point continuing.
410 : continue;
411 :
412 790 : for (Value *O : cast<User>(I)->operands()) {
413 504 : ECs.unionSets(Leader, O);
414 504 : Worklist.push_back(O);
415 : }
416 : }
417 :
418 : // Now we've discovered all values, walk them to see if there are
419 : // any users we didn't see. If there are, we can't optimize that
420 : // chain.
421 574 : for (auto &I : DBits)
422 1169 : for (auto *U : I.first->users())
423 1774 : if (U->getType()->isIntegerTy() && DBits.count(U) == 0)
424 322 : DBits[ECs.getOrInsertLeaderValue(I.first)] |= ~0ULL;
425 :
426 743 : for (auto I = ECs.begin(), E = ECs.end(); I != E; ++I) {
427 : uint64_t LeaderDemandedBits = 0;
428 1266 : for (auto MI = ECs.member_begin(I), ME = ECs.member_end(); MI != ME; ++MI)
429 633 : LeaderDemandedBits |= DBits[*MI];
430 :
431 : uint64_t MinBW = (sizeof(LeaderDemandedBits) * 8) -
432 633 : llvm::countLeadingZeros(LeaderDemandedBits);
433 : // Round up to a power of 2
434 : if (!isPowerOf2_64((uint64_t)MinBW))
435 : MinBW = NextPowerOf2(MinBW);
436 :
437 : // We don't modify the types of PHIs. Reductions will already have been
438 : // truncated if possible, and inductions' sizes will have been chosen by
439 : // indvars.
440 : // If we are required to shrink a PHI, abandon this entire equivalence class.
441 : bool Abort = false;
442 1266 : for (auto MI = ECs.member_begin(I), ME = ECs.member_end(); MI != ME; ++MI)
443 698 : if (isa<PHINode>(*MI) && MinBW < (*MI)->getType()->getScalarSizeInBits()) {
444 : Abort = true;
445 : break;
446 : }
447 633 : if (Abort)
448 : continue;
449 :
450 1266 : for (auto MI = ECs.member_begin(I), ME = ECs.member_end(); MI != ME; ++MI) {
451 1266 : if (!isa<Instruction>(*MI))
452 : continue;
453 464 : Type *Ty = (*MI)->getType();
454 464 : if (Roots.count(*MI))
455 394 : Ty = cast<Instruction>(*MI)->getOperand(0)->getType();
456 464 : if (MinBW < Ty->getScalarSizeInBits())
457 58 : MinBWs[cast<Instruction>(*MI)] = MinBW;
458 : }
459 : }
460 :
461 : return MinBWs;
462 : }
463 :
464 : /// \returns \p Inst after propagating metadata from \p VL.
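      : /// For example, the !tbaa node attached to \p Inst is computed with
      : /// MDNode::getMostGenericTBAA over all instructions in \p VL.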
465 295931 : Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
466 295931 : Instruction *I0 = cast<Instruction>(VL[0]);
467 : SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
468 : I0->getAllMetadataOtherThanDebugLoc(Metadata);
469 :
470 3551172 : for (auto Kind :
471 : {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
472 : LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
473 2071517 : LLVMContext::MD_nontemporal, LLVMContext::MD_invariant_load}) {
474 : MDNode *MD = I0->getMetadata(Kind);
475 :
476 1798636 : for (int J = 1, E = VL.size(); MD && J != E; ++J) {
477 46100 : const Instruction *IJ = cast<Instruction>(VL[J]);
478 : MDNode *IMD = IJ->getMetadata(Kind);
479 23050 : switch (Kind) {
480 787 : case LLVMContext::MD_tbaa:
481 787 : MD = MDNode::getMostGenericTBAA(MD, IMD);
482 787 : break;
483 9 : case LLVMContext::MD_alias_scope:
484 9 : MD = MDNode::getMostGenericAliasScope(MD, IMD);
485 9 : break;
486 2 : case LLVMContext::MD_fpmath:
487 2 : MD = MDNode::getMostGenericFPMath(MD, IMD);
488 2 : break;
489 22252 : case LLVMContext::MD_noalias:
490 : case LLVMContext::MD_nontemporal:
491 : case LLVMContext::MD_invariant_load:
492 22252 : MD = MDNode::intersect(MD, IMD);
493 22252 : break;
494 0 : default:
495 0 : llvm_unreachable("unhandled metadata");
496 : }
497 : }
498 :
499 1775586 : Inst->setMetadata(Kind, MD);
500 : }
501 :
502 295931 : return Inst;
503 : }
504 :
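      : // For example, createReplicatedMask(Builder, /*ReplicationFactor=*/3,
      : // /*VF=*/2) produces the shuffle mask <0, 0, 0, 1, 1, 1>.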
505 3 : Constant *llvm::createReplicatedMask(IRBuilder<> &Builder,
506 : unsigned ReplicationFactor, unsigned VF) {
507 : SmallVector<Constant *, 16> MaskVec;
508 27 : for (unsigned i = 0; i < VF; i++)
509 72 : for (unsigned j = 0; j < ReplicationFactor; j++)
510 48 : MaskVec.push_back(Builder.getInt32(i));
511 :
512 3 : return ConstantVector::get(MaskVec);
513 : }
514 :
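      : // For example, createInterleaveMask(Builder, /*VF=*/4, /*NumVecs=*/2)
      : // produces <0, 4, 1, 5, 2, 6, 3, 7>, interleaving two 4-element vectors.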
515 18 : Constant *llvm::createInterleaveMask(IRBuilder<> &Builder, unsigned VF,
516 : unsigned NumVecs) {
517 : SmallVector<Constant *, 16> Mask;
518 110 : for (unsigned i = 0; i < VF; i++)
519 292 : for (unsigned j = 0; j < NumVecs; j++)
520 200 : Mask.push_back(Builder.getInt32(j * VF + i));
521 :
522 18 : return ConstantVector::get(Mask);
523 : }
524 :
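      : // For example, createStrideMask(Builder, /*Start=*/0, /*Stride=*/2,
      : // /*VF=*/4) produces <0, 2, 4, 6>, selecting every other element.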
525 79 : Constant *llvm::createStrideMask(IRBuilder<> &Builder, unsigned Start,
526 : unsigned Stride, unsigned VF) {
527 : SmallVector<Constant *, 16> Mask;
528 467 : for (unsigned i = 0; i < VF; i++)
529 388 : Mask.push_back(Builder.getInt32(Start + i * Stride));
530 :
531 79 : return ConstantVector::get(Mask);
532 : }
533 :
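      : // For example, createSequentialMask(Builder, /*Start=*/0, /*NumInts=*/4,
      : // /*NumUndefs=*/4) produces <0, 1, 2, 3, undef, undef, undef, undef>.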
534 460 : Constant *llvm::createSequentialMask(IRBuilder<> &Builder, unsigned Start,
535 : unsigned NumInts, unsigned NumUndefs) {
536 : SmallVector<Constant *, 16> Mask;
537 11628 : for (unsigned i = 0; i < NumInts; i++)
538 11168 : Mask.push_back(Builder.getInt32(Start + i));
539 :
540 460 : Constant *Undef = UndefValue::get(Builder.getInt32Ty());
541 932 : for (unsigned i = 0; i < NumUndefs; i++)
542 472 : Mask.push_back(Undef);
543 :
544 460 : return ConstantVector::get(Mask);
545 : }
546 :
547 : /// A helper function for concatenating vectors. This function concatenates two
548 : /// vectors having the same element type. If the second vector has fewer
549 : /// elements than the first, it is padded with undefs.
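      : /// For example, concatenating two <2 x i32> values yields a single
      : /// <4 x i32> shufflevector of the two inputs with mask <0, 1, 2, 3>.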
550 146 : static Value *concatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
551 : Value *V2) {
552 146 : VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
553 146 : VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
554 : assert(VecTy1 && VecTy2 &&
555 : VecTy1->getScalarType() == VecTy2->getScalarType() &&
556 : "Expect two vectors with the same element type");
557 :
558 146 : unsigned NumElts1 = VecTy1->getNumElements();
559 146 : unsigned NumElts2 = VecTy2->getNumElements();
560 : assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements");
561 :
562 146 : if (NumElts1 > NumElts2) {
563 : // Extend with UNDEFs.
564 : Constant *ExtMask =
565 18 : createSequentialMask(Builder, 0, NumElts2, NumElts1 - NumElts2);
566 18 : V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask);
567 : }
568 :
569 146 : Constant *Mask = createSequentialMask(Builder, 0, NumElts1 + NumElts2, 0);
570 146 : return Builder.CreateShuffleVector(V1, V2, Mask);
571 : }
572 :
573 82 : Value *llvm::concatenateVectors(IRBuilder<> &Builder, ArrayRef<Value *> Vecs) {
574 82 : unsigned NumVecs = Vecs.size();
575 : assert(NumVecs > 1 && "Should be at least two vectors");
576 :
577 : SmallVector<Value *, 8> ResList;
578 82 : ResList.append(Vecs.begin(), Vecs.end());
579 : do {
580 : SmallVector<Value *, 8> TmpList;
581 269 : for (unsigned i = 0; i < NumVecs - 1; i += 2) {
582 292 : Value *V0 = ResList[i], *V1 = ResList[i + 1];
583 : assert((V0->getType() == V1->getType() || i == NumVecs - 2) &&
584 : "Only the last vector may have a different type");
585 :
586 146 : TmpList.push_back(concatenateTwoVectors(Builder, V0, V1));
587 : }
588 :
589 : // Push the last vector if the total number of vectors is odd.
590 123 : if (NumVecs % 2 != 0)
591 36 : TmpList.push_back(ResList[NumVecs - 1]);
592 :
593 : ResList = TmpList;
594 123 : NumVecs = ResList.size();
595 123 : } while (NumVecs > 1);
596 :
597 82 : return ResList[0];
598 : }
599 :
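      : // A stride is considered interleavable when its absolute value lies
      : // between 2 and MaxInterleaveGroupFactor (inclusive).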
600 2671 : bool InterleavedAccessInfo::isStrided(int Stride) {
601 2671 : unsigned Factor = std::abs(Stride);
602 2671 : return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
603 : }
604 :
605 475 : void InterleavedAccessInfo::collectConstStrideAccesses(
606 : MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
607 : const ValueToValueMap &Strides) {
608 475 : auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
609 :
610 : // Since it's desired that the load/store instructions be maintained in
611 : // "program order" for the interleaved access analysis, we have to visit the
612 : // blocks in the loop in reverse postorder (i.e., in a topological order).
613 : // Such an ordering will ensure that any load/store that may be executed
614 : // before a second load/store will precede the second load/store in
615 : // AccessStrideInfo.
616 950 : LoopBlocksDFS DFS(TheLoop);
617 475 : DFS.perform(LI);
618 1137 : for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
619 7244 : for (auto &I : *BB) {
620 : auto *LI = dyn_cast<LoadInst>(&I);
621 : auto *SI = dyn_cast<StoreInst>(&I);
622 6582 : if (!LI && !SI)
623 : continue;
624 :
625 : Value *Ptr = getLoadStorePointerOperand(&I);
626 : // We don't check wrapping here because we don't know yet if Ptr will be
627 : // part of a full group or a group with gaps. Checking wrapping for all
628 : // pointers (even those that end up in groups with no gaps) will be overly
629 : // conservative. For full groups, wrapping should be ok since if we would
630 : // wrap around the address space we would do a memory access at nullptr
631 : // even without the transformation. The wrapping checks are therefore
632 : // deferred until after we've formed the interleaved groups.
633 1056 : int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides,
634 : /*Assume=*/true, /*ShouldCheckWrap=*/false);
635 :
636 1056 : const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
637 1056 : PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
638 1056 : uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
639 :
640 : // An alignment of 0 means target ABI alignment.
641 : unsigned Align = getLoadStoreAlignment(&I);
642 1056 : if (!Align)
643 32 : Align = DL.getABITypeAlignment(PtrTy->getElementType());
644 :
645 1056 : AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, Align);
646 : }
647 475 : }
648 :
649 : // Analyze interleaved accesses and collect them into interleaved load and
650 : // store groups.
651 : //
652 : // When generating code for an interleaved load group, we effectively hoist all
653 : // loads in the group to the location of the first load in program order. When
654 : // generating code for an interleaved store group, we sink all stores to the
655 : // location of the last store. This code motion can change the order of load
656 : // and store instructions and may break dependences.
657 : //
658 : // The code generation strategy mentioned above ensures that we won't violate
659 : // any write-after-read (WAR) dependences.
660 : //
661 : // E.g., for the WAR dependence: a = A[i]; // (1)
662 : //                                A[i] = b;      // (2)
663 : //
664 : // The store group of (2) is always inserted at or below (2), and the load
665 : // group of (1) is always inserted at or above (1). Thus, the instructions will
666 : // never be reordered. All other dependences are checked to ensure the
667 : // correctness of the instruction reordering.
668 : //
669 : // The algorithm visits all memory accesses in the loop in bottom-up program
670 : // order. Program order is established by traversing the blocks in the loop in
671 : // reverse postorder when collecting the accesses.
672 : //
673 : // We visit the memory accesses in bottom-up order because it can simplify the
674 : // construction of store groups in the presence of write-after-write (WAW)
675 : // dependences.
676 : //
677 : // E.g., for the WAW dependence: A[i] = a; // (1)
678 : //                                A[i] = b;      // (2)
679 : //                                A[i + 1] = c;  // (3)
680 : //
681 : // We will first create a store group with (3) and (2). (1) can't be added to
682 : // this group because it and (2) are dependent. However, (1) can be grouped
683 : // with other accesses that may precede it in program order. Note that a
684 : // bottom-up order does not imply that WAW dependences should not be checked.
685 475 : void InterleavedAccessInfo::analyzeInterleaving(
686 : bool EnablePredicatedInterleavedMemAccesses) {
687 : LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
688 475 : const ValueToValueMap &Strides = LAI->getSymbolicStrides();
689 :
690 : // Holds all accesses with a constant stride.
691 457 : MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
692 475 : collectConstStrideAccesses(AccessStrideInfo, Strides);
693 :
694 475 : if (AccessStrideInfo.empty())
695 18 : return;
696 :
697 : // Collect the dependences in the loop.
698 457 : collectDependences();
699 :
700 : // Holds all interleaved store groups temporarily.
701 : SmallSetVector<InterleaveGroup *, 4> StoreGroups;
702 : // Holds all interleaved load groups temporarily.
703 : SmallSetVector<InterleaveGroup *, 4> LoadGroups;
704 :
705 : // Search in bottom-up program order for pairs of accesses (A and B) that can
706 : // form interleaved load or store groups. In the algorithm below, access A
707 : // precedes access B in program order. We initialize a group for B in the
708 : // outer loop of the algorithm, and then in the inner loop, we attempt to
709 : // insert each A into B's group if:
710 : //
711 : // 1. A and B have the same stride,
712 : // 2. A and B have the same memory object size, and
713 : // 3. A belongs in B's group according to its distance from B.
714 : //
715 : // Special care is taken to ensure group formation will not break any
716 : // dependences.
717 : for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
718 1513 : BI != E; ++BI) {
719 1056 : Instruction *B = BI->first;
720 1056 : StrideDescriptor DesB = BI->second;
721 :
722 : // Initialize a group for B if it has an allowable stride. Even if we don't
723 : // create a group for B, we continue with the bottom-up algorithm to ensure
724 : // we don't break any of B's dependences.
725 1056 : InterleaveGroup *Group = nullptr;
726 1236 : if (isStrided(DesB.Stride) &&
727 211 : (!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) {
728 157 : Group = getInterleaveGroup(B);
729 157 : if (!Group) {
730 : LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
731 : << '\n');
732 99 : Group = createInterleaveGroup(B, DesB.Stride, DesB.Align);
733 : }
734 157 : if (B->mayWriteToMemory())
735 64 : StoreGroups.insert(Group);
736 : else
737 93 : LoadGroups.insert(Group);
738 : }
739 :
740 2225 : for (auto AI = std::next(BI); AI != E; ++AI) {
741 1176 : Instruction *A = AI->first;
742 1176 : StrideDescriptor DesA = AI->second;
743 :
744 : // Our code motion strategy implies that we can't have dependences
745 : // between accesses in an interleaved group and other accesses located
746 : // between the first and last member of the group. Note that this also
747 : // means that a group can't have more than one member at a given offset.
748 : // The accesses in a group can have dependences with other accesses, but
749 : // we must ensure we don't extend the boundaries of the group such that
750 : // we encompass those dependent accesses.
751 : //
752 : // For example, assume we have the sequence of accesses shown below in a
753 : // stride-2 loop:
754 : //
755 : //      (1, 2) is a group | A[i]   = a;  // (1)
756 : //                        | A[i-1] = b;  // (2) |
757 : //                          A[i-3] = c;  // (3)
758 : //                          A[i]   = d;  // (4) | (2, 4) is not a group
759 : //
760 : // Because accesses (2) and (3) are dependent, we can group (2) with (1)
761 : // but not with (4). If we did, the dependent access (3) would be within
762 : // the boundaries of the (2, 4) group.
763 1176 : if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) {
764 : // If a dependence exists and A is already in a group, we know that A
765 : // must be a store since A precedes B and WAR dependences are allowed.
766 : // Thus, A would be sunk below B. We release A's group to prevent this
767 : // illegal code motion. A will then be free to form another group with
768 : // instructions that precede it.
769 7 : if (isInterleaved(A)) {
770 3 : InterleaveGroup *StoreGroup = getInterleaveGroup(A);
771 3 : StoreGroups.remove(StoreGroup);
772 3 : releaseGroup(StoreGroup);
773 : }
774 :
775 : // If a dependence exists and A is not already in a group (or it was
776 : // and we just released it), B might be hoisted above A (if B is a
777 : // load) or another store might be sunk below A (if B is a store). In
778 : // either case, we can't add additional instructions to B's group. B
779 : // will only form a group with instructions that it precedes.
780 7 : break;
781 : }
782 :
783 : // At this point, we've checked for illegal code motion. If either A or B
784 : // isn't strided, there's nothing left to do.
785 1169 : if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
786 1101 : continue;
787 :
788 : // Ignore A if it's already in a group or isn't the same kind of memory
789 : // operation as B.
790 : // Note that mayReadFromMemory() isn't mutually exclusive with
791 : // mayWriteToMemory in the case of atomic loads. We shouldn't see those
792 : // here; canVectorizeMemory() should have returned false - except for the
793 : // case we asked for optimization remarks.
794 374 : if (isInterleaved(A) ||
795 302 : (A->mayReadFromMemory() != B->mayReadFromMemory()) ||
796 94 : (A->mayWriteToMemory() != B->mayWriteToMemory()))
797 114 : continue;
798 :
799 : // Check rules 1 and 2. Ignore A if its stride or size is different from
800 : // that of B.
801 94 : if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
802 : continue;
803 :
804 : // Ignore A if the memory objects of A and B don't belong to the same
805 : // address space.
806 94 : if (getLoadStoreAddressSpace(A) != getLoadStoreAddressSpace(B))
807 : continue;
808 :
809 : // Calculate the distance from A to B.
810 93 : const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
811 93 : PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
812 : if (!DistToB)
813 : continue;
814 : int64_t DistanceToB = DistToB->getAPInt().getSExtValue();
815 :
816 : // Check rule 3. Ignore A if its distance to B is not a multiple of the
817 : // size.
818 89 : if (DistanceToB % static_cast<int64_t>(DesB.Size))
819 : continue;
820 :
821 : // All members of a predicated interleave-group must have the same predicate,
822 : // and currently must reside in the same BB.
823 89 : BasicBlock *BlockA = A->getParent();
824 89 : BasicBlock *BlockB = B->getParent();
825 89 : if ((isPredicated(BlockA) || isPredicated(BlockB)) &&
826 23 : (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB))
827 : continue;
828 :
829 : // The index of A is the index of B plus A's distance to B in multiples
830 : // of the size.
831 : int IndexA =
832 68 : Group->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);
833 :
834 : // Try to insert A into B's group.
835 68 : if (Group->insertMember(A, IndexA, DesA.Align)) {
836 : LLVM_DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
837 : << " into the interleave group with" << *B
838 : << '\n');
839 61 : InterleaveGroupMap[A] = Group;
840 :
841 : // Set the first load in program order as the insert position.
842 61 : if (A->mayReadFromMemory())
843 36 : Group->setInsertPos(A);
844 : }
845 : } // Iteration over A accesses.
846 : } // Iteration over B accesses.
847 :
848 : // Remove interleaved store groups with gaps.
849 496 : for (InterleaveGroup *Group : StoreGroups)
850 39 : if (Group->getNumMembers() != Group->getFactor()) {
851 : LLVM_DEBUG(
852 : dbgs() << "LV: Invalidate candidate interleaved store group due "
853 : "to gaps.\n");
854 22 : releaseGroup(Group);
855 : }
856 : // Remove interleaved groups with gaps (currently only loads) whose memory
857 : // accesses may wrap around. We have to revisit the getPtrStride analysis,
858 : // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
859 : // not check wrapping (see documentation there).
860 : // FORNOW we use Assume=false;
861 : // TODO: Change to Assume=true but making sure we don't exceed the threshold
862 : // of runtime SCEV assumptions checks (thereby potentially failing to
863 : // vectorize altogether).
864 : // Additional optional optimizations:
865 : // TODO: If we are peeling the loop and we know that the first pointer doesn't
866 : // wrap then we can deduce that all pointers in the group don't wrap.
867 : // This means that we can forcefully peel the loop in order to only have to
868 : // check the first pointer for no-wrap. When we'll change to use Assume=true
869 : // we'll only need at most one runtime check per interleaved group.
870 514 : for (InterleaveGroup *Group : LoadGroups) {
871 : // Case 1: A full group. We can skip the checks; for full groups, if the wide
872 : // load would wrap around the address space we would do a memory access at
873 : // nullptr even without the transformation.
874 57 : if (Group->getNumMembers() == Group->getFactor())
875 : continue;
876 :
877 : // Case 2: If first and last members of the group don't wrap this implies
878 : // that all the pointers in the group don't wrap.
879 : // So we check only group member 0 (which is always guaranteed to exist),
880 : // and group member Factor - 1; if the latter doesn't exist we rely on
881 : // peeling (if it is a non-reversed access -- see Case 3).
882 32 : Value *FirstMemberPtr = getLoadStorePointerOperand(Group->getMember(0));
883 32 : if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
884 : /*ShouldCheckWrap=*/true)) {
885 : LLVM_DEBUG(
886 : dbgs() << "LV: Invalidate candidate interleaved group due to "
887 : "first group member potentially pointer-wrapping.\n");
888 5 : releaseGroup(Group);
889 5 : continue;
890 : }
891 27 : Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
892 27 : if (LastMember) {
893 : Value *LastMemberPtr = getLoadStorePointerOperand(LastMember);
894 0 : if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
895 : /*ShouldCheckWrap=*/true)) {
896 : LLVM_DEBUG(
897 : dbgs() << "LV: Invalidate candidate interleaved group due to "
898 : "last group member potentially pointer-wrapping.\n");
899 0 : releaseGroup(Group);
900 : }
901 : } else {
902 : // Case 3: A non-reversed interleaved load group with gaps: We need
903 : // to execute at least one scalar epilogue iteration. This will ensure
904 : // we don't speculatively access memory out-of-bounds. We only need
905 : // to look for a member at index factor - 1, since every group must have
906 : // a member at index zero.
907 27 : if (Group->isReverse()) {
908 : LLVM_DEBUG(
909 : dbgs() << "LV: Invalidate candidate interleaved group due to "
910 : "a reverse access with gaps.\n");
911 1 : releaseGroup(Group);
912 1 : continue;
913 : }
914 : LLVM_DEBUG(
915 : dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
916 26 : RequiresScalarEpilogue = true;
917 : }
918 : }
919 : }
919 : }