LLVM 22.0.0git
VectorUtils.cpp
1//===----------- VectorUtils.cpp - Vectorizer utility functions -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines vectorizer utilities.
10//
11//===----------------------------------------------------------------------===//
12
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
30
31#define DEBUG_TYPE "vectorutils"
32
33using namespace llvm;
34using namespace llvm::PatternMatch;
35
36/// Maximum factor for an interleaved memory access.
37static cl::opt<unsigned> MaxInterleaveGroupFactor(
38 "max-interleave-group-factor", cl::Hidden,
39 cl::desc("Maximum factor for an interleaved access group (default = 8)"),
40 cl::init(8));
41
42/// Return true if all of the intrinsic's arguments and return type are scalars
43/// for the scalar form of the intrinsic, and vectors for the vector form of the
44/// intrinsic (except operands that are marked as always being scalar by
45/// isVectorIntrinsicWithScalarOpAtArg).
46bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
47 switch (ID) {
48 case Intrinsic::abs: // Begin integer bit-manipulation.
49 case Intrinsic::bswap:
50 case Intrinsic::bitreverse:
51 case Intrinsic::ctpop:
52 case Intrinsic::ctlz:
53 case Intrinsic::cttz:
54 case Intrinsic::fshl:
55 case Intrinsic::fshr:
56 case Intrinsic::smax:
57 case Intrinsic::smin:
58 case Intrinsic::umax:
59 case Intrinsic::umin:
60 case Intrinsic::sadd_sat:
61 case Intrinsic::ssub_sat:
62 case Intrinsic::uadd_sat:
63 case Intrinsic::usub_sat:
64 case Intrinsic::smul_fix:
65 case Intrinsic::smul_fix_sat:
66 case Intrinsic::umul_fix:
67 case Intrinsic::umul_fix_sat:
68 case Intrinsic::sqrt: // Begin floating-point.
69 case Intrinsic::asin:
70 case Intrinsic::acos:
71 case Intrinsic::atan:
72 case Intrinsic::atan2:
73 case Intrinsic::sin:
74 case Intrinsic::cos:
75 case Intrinsic::sincos:
76 case Intrinsic::sincospi:
77 case Intrinsic::tan:
78 case Intrinsic::sinh:
79 case Intrinsic::cosh:
80 case Intrinsic::tanh:
81 case Intrinsic::exp:
82 case Intrinsic::exp10:
83 case Intrinsic::exp2:
84 case Intrinsic::ldexp:
85 case Intrinsic::log:
86 case Intrinsic::log10:
87 case Intrinsic::log2:
88 case Intrinsic::fabs:
89 case Intrinsic::minnum:
90 case Intrinsic::maxnum:
91 case Intrinsic::minimum:
92 case Intrinsic::maximum:
93 case Intrinsic::minimumnum:
94 case Intrinsic::maximumnum:
95 case Intrinsic::modf:
96 case Intrinsic::copysign:
97 case Intrinsic::floor:
98 case Intrinsic::ceil:
99 case Intrinsic::trunc:
100 case Intrinsic::rint:
101 case Intrinsic::nearbyint:
102 case Intrinsic::round:
103 case Intrinsic::roundeven:
104 case Intrinsic::pow:
105 case Intrinsic::fma:
106 case Intrinsic::fmuladd:
107 case Intrinsic::is_fpclass:
108 case Intrinsic::powi:
109 case Intrinsic::canonicalize:
110 case Intrinsic::fptosi_sat:
111 case Intrinsic::fptoui_sat:
112 case Intrinsic::lround:
113 case Intrinsic::llround:
114 case Intrinsic::lrint:
115 case Intrinsic::llrint:
116 case Intrinsic::ucmp:
117 case Intrinsic::scmp:
118 return true;
119 default:
120 return false;
121 }
122}
123
124bool llvm::isTriviallyScalarizable(Intrinsic::ID ID,
125 const TargetTransformInfo *TTI) {
126 if (isTriviallyVectorizable(ID))
127 return true;
128
129 if (TTI)
130 return TTI->isTargetIntrinsicTriviallyScalarizable(ID);
131
132 // TODO: Move frexp to isTriviallyVectorizable.
133 // https://github.com/llvm/llvm-project/issues/112408
134 switch (ID) {
135 case Intrinsic::frexp:
136 case Intrinsic::uadd_with_overflow:
137 case Intrinsic::sadd_with_overflow:
138 case Intrinsic::ssub_with_overflow:
139 case Intrinsic::usub_with_overflow:
140 case Intrinsic::umul_with_overflow:
141 case Intrinsic::smul_with_overflow:
142 return true;
143 }
144 return false;
145}
146
147/// Identifies if the vector form of the intrinsic has a scalar operand.
148bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
149 unsigned ScalarOpdIdx,
150 const TargetTransformInfo *TTI) {
151
152 if (TTI)
153 return TTI->isTargetIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx);
154
155 // Vector predication intrinsics have the EVL as the last operand.
156 if (VPIntrinsic::getVectorLengthParamPos(ID) == ScalarOpdIdx)
157 return true;
158
159 switch (ID) {
160 case Intrinsic::abs:
161 case Intrinsic::vp_abs:
162 case Intrinsic::ctlz:
163 case Intrinsic::vp_ctlz:
164 case Intrinsic::cttz:
165 case Intrinsic::vp_cttz:
166 case Intrinsic::is_fpclass:
167 case Intrinsic::vp_is_fpclass:
168 case Intrinsic::powi:
169 case Intrinsic::vector_extract:
170 return (ScalarOpdIdx == 1);
171 case Intrinsic::smul_fix:
172 case Intrinsic::smul_fix_sat:
173 case Intrinsic::umul_fix:
174 case Intrinsic::umul_fix_sat:
175 return (ScalarOpdIdx == 2);
176 case Intrinsic::experimental_vp_splice:
177 return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
178 default:
179 return false;
180 }
181}
182
183bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
184 Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI) {
185 assert(ID != Intrinsic::not_intrinsic && "Not an intrinsic!");
186
187 if (TTI)
188 return TTI->isTargetIntrinsicWithOverloadTypeAtArg(ID, OpdIdx);
189
190 if (VPCastIntrinsic::isVPCast(ID))
191 return OpdIdx == -1 || OpdIdx == 0;
192
193 switch (ID) {
194 case Intrinsic::fptosi_sat:
195 case Intrinsic::fptoui_sat:
196 case Intrinsic::lround:
197 case Intrinsic::llround:
198 case Intrinsic::lrint:
199 case Intrinsic::llrint:
200 case Intrinsic::vp_lrint:
201 case Intrinsic::vp_llrint:
202 case Intrinsic::ucmp:
203 case Intrinsic::scmp:
204 case Intrinsic::vector_extract:
205 return OpdIdx == -1 || OpdIdx == 0;
206 case Intrinsic::modf:
207 case Intrinsic::sincos:
208 case Intrinsic::sincospi:
209 case Intrinsic::is_fpclass:
210 case Intrinsic::vp_is_fpclass:
211 return OpdIdx == 0;
212 case Intrinsic::powi:
213 case Intrinsic::ldexp:
214 return OpdIdx == -1 || OpdIdx == 1;
215 default:
216 return OpdIdx == -1;
217 }
218}
219
220bool llvm::isVectorIntrinsicWithStructReturnOverloadAtField(
221 Intrinsic::ID ID, int RetIdx, const TargetTransformInfo *TTI) {
222
223 if (TTI)
224 return TTI->isTargetIntrinsicWithStructReturnOverloadAtField(ID, RetIdx);
225
226 switch (ID) {
227 case Intrinsic::frexp:
228 return RetIdx == 0 || RetIdx == 1;
229 default:
230 return RetIdx == 0;
231 }
232}
233
234/// Returns intrinsic ID for call.
235/// For the input call instruction it finds the corresponding intrinsic and
236/// returns its ID; if no mapping is found, it returns not_intrinsic.
237Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
238 const TargetLibraryInfo *TLI) {
239 Intrinsic::ID ID = getIntrinsicForCallSite(*CI, TLI);
240 if (ID == Intrinsic::not_intrinsic)
241 return Intrinsic::not_intrinsic;
242
243 if (isTriviallyVectorizable(ID) || ID == Intrinsic::lifetime_start ||
244 ID == Intrinsic::lifetime_end || ID == Intrinsic::assume ||
245 ID == Intrinsic::experimental_noalias_scope_decl ||
246 ID == Intrinsic::sideeffect || ID == Intrinsic::pseudoprobe)
247 return ID;
248 return Intrinsic::not_intrinsic;
249}
250
251unsigned llvm::getInterleaveIntrinsicFactor(Intrinsic::ID ID) {
252 switch (ID) {
253 case Intrinsic::vector_interleave2:
254 return 2;
255 case Intrinsic::vector_interleave3:
256 return 3;
257 case Intrinsic::vector_interleave4:
258 return 4;
259 case Intrinsic::vector_interleave5:
260 return 5;
261 case Intrinsic::vector_interleave6:
262 return 6;
263 case Intrinsic::vector_interleave7:
264 return 7;
265 case Intrinsic::vector_interleave8:
266 return 8;
267 default:
268 return 0;
269 }
270}
271
272unsigned llvm::getDeinterleaveIntrinsicFactor(Intrinsic::ID ID) {
273 switch (ID) {
274 case Intrinsic::vector_deinterleave2:
275 return 2;
276 case Intrinsic::vector_deinterleave3:
277 return 3;
278 case Intrinsic::vector_deinterleave4:
279 return 4;
280 case Intrinsic::vector_deinterleave5:
281 return 5;
282 case Intrinsic::vector_deinterleave6:
283 return 6;
284 case Intrinsic::vector_deinterleave7:
285 return 7;
286 case Intrinsic::vector_deinterleave8:
287 return 8;
288 default:
289 return 0;
290 }
291}
292
293VectorType *llvm::getDeinterleavedVectorType(IntrinsicInst *DI) {
294 [[maybe_unused]] unsigned Factor =
295 getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
296 ArrayRef<Type *> DISubtypes = DI->getType()->subtypes();
297 assert(Factor && Factor == DISubtypes.size() &&
298 "unexpected deinterleave factor or result type");
299 return cast<VectorType>(DISubtypes[0]);
300}
301
302/// Given a vector and an element number, see if the scalar value is
303/// already around as a register, for example if it were inserted then extracted
304/// from the vector.
305Value *llvm::findScalarElement(Value *V, unsigned EltNo) {
306 assert(V->getType()->isVectorTy() && "Not looking at a vector?");
307 VectorType *VTy = cast<VectorType>(V->getType());
308 // For fixed-length vector, return poison for out of range access.
309 if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
310 unsigned Width = FVTy->getNumElements();
311 if (EltNo >= Width)
312 return PoisonValue::get(FVTy->getElementType());
313 }
314
315 if (Constant *C = dyn_cast<Constant>(V))
316 return C->getAggregateElement(EltNo);
317
318 if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) {
319 // If this is an insert to a variable element, we don't know what it is.
320 uint64_t IIElt;
321 if (!match(III->getOperand(2), m_ConstantInt(IIElt)))
322 return nullptr;
323
324 // If this is an insert to the element we are looking for, return the
325 // inserted value.
326 if (EltNo == IIElt)
327 return III->getOperand(1);
328
329 // Guard against infinite loop on malformed, unreachable IR.
330 if (III == III->getOperand(0))
331 return nullptr;
332
333 // Otherwise, the insertelement doesn't modify the value, recurse on its
334 // vector input.
335 return findScalarElement(III->getOperand(0), EltNo);
336 }
337
338 ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V);
339 // Restrict the following transformation to fixed-length vector.
340 if (SVI && isa<FixedVectorType>(SVI->getType())) {
341 unsigned LHSWidth =
342 cast<FixedVectorType>(SVI->getOperand(0)->getType())->getNumElements();
343 int InEl = SVI->getMaskValue(EltNo);
344 if (InEl < 0)
345 return PoisonValue::get(VTy->getElementType());
346 if (InEl < (int)LHSWidth)
347 return findScalarElement(SVI->getOperand(0), InEl);
348 return findScalarElement(SVI->getOperand(1), InEl - LHSWidth);
349 }
350
351 // Extract a value from a vector add operation with a constant zero.
352 // TODO: Use getBinOpIdentity() to generalize this.
353 Value *Val; Constant *C;
354 if (match(V, m_Add(m_Value(Val), m_Constant(C))))
355 if (Constant *Elt = C->getAggregateElement(EltNo))
356 if (Elt->isNullValue())
357 return findScalarElement(Val, EltNo);
358
359 // If the vector is a splat then we can trivially find the scalar element.
361 if (Value *Splat = getSplatValue(V))
362 if (EltNo < VTy->getElementCount().getKnownMinValue())
363 return Splat;
364
365 // Otherwise, we don't know.
366 return nullptr;
367}
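// Illustrative example (not part of the original source): given
//   %w = insertelement <4 x i32> %v, i32 %x, i32 2
// findScalarElement(%w, 2) returns %x, while findScalarElement(%w, 0)
// recurses into %v to look for element 0 there.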
368
369int llvm::getSplatIndex(ArrayRef<int> Mask) {
370 int SplatIndex = -1;
371 for (int M : Mask) {
372 // Ignore invalid (undefined) mask elements.
373 if (M < 0)
374 continue;
375
376 // There can be only 1 non-negative mask element value if this is a splat.
377 if (SplatIndex != -1 && SplatIndex != M)
378 return -1;
379
380 // Initialize the splat index to the 1st non-negative mask element.
381 SplatIndex = M;
382 }
383 assert((SplatIndex == -1 || SplatIndex >= 0) && "Negative index?");
384 return SplatIndex;
385}
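// Illustrative example (not part of the original source):
//   getSplatIndex({3, -1, 3, 3}) == 3   (undef elements are ignored)
//   getSplatIndex({0, 1, 0, 1}) == -1   (two distinct indices, not a splat)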
386
387/// Get splat value if the input is a splat vector or return nullptr.
388/// This function is not fully general. It checks only 2 cases:
389/// the input value is (1) a splat constant vector or (2) a sequence
390/// of instructions that broadcasts a scalar at element 0.
391Value *llvm::getSplatValue(const Value *V) {
392 if (isa<VectorType>(V->getType()))
393 if (auto *C = dyn_cast<Constant>(V))
394 return C->getSplatValue();
395
396 // shuf (inselt ?, Splat, 0), ?, <0, undef, 0, ...>
397 Value *Splat;
398 if (match(V,
399 m_Shuffle(m_InsertElt(m_Value(), m_Value(Splat), m_ZeroInt()),
400 m_Value(), m_ZeroMask())))
401 return Splat;
402
403 return nullptr;
404}
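// Illustrative example (not part of the original source): a broadcast built as
//   shufflevector (insertelement poison, %s, 0), poison, zeroinitializer
// returns %s, and the constant splat <i32 7, i32 7, i32 7, i32 7> returns i32 7.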
405
406bool llvm::isSplatValue(const Value *V, int Index, unsigned Depth) {
407 assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth");
408
409 if (isa<VectorType>(V->getType())) {
410 if (isa<UndefValue>(V))
411 return true;
412 // FIXME: We can allow undefs, but if Index was specified, we may want to
413 // check that the constant is defined at that index.
414 if (auto *C = dyn_cast<Constant>(V))
415 return C->getSplatValue() != nullptr;
416 }
417
418 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V)) {
419 // FIXME: We can safely allow undefs here. If Index was specified, we will
420 // check that the mask elt is defined at the required index.
421 if (!all_equal(Shuf->getShuffleMask()))
422 return false;
423
424 // Match any index.
425 if (Index == -1)
426 return true;
427
428 // Match a specific element. The mask should be defined at and match the
429 // specified index.
430 return Shuf->getMaskValue(Index) == Index;
431 }
432
433 // The remaining tests are all recursive, so bail out if we hit the limit.
434 if (Depth++ == MaxAnalysisRecursionDepth)
435 return false;
436
437 // If both operands of a binop are splats, the result is a splat.
438 Value *X, *Y, *Z;
439 if (match(V, m_BinOp(m_Value(X), m_Value(Y))))
440 return isSplatValue(X, Index, Depth) && isSplatValue(Y, Index, Depth);
441
442 // If all operands of a select are splats, the result is a splat.
443 if (match(V, m_Select(m_Value(X), m_Value(Y), m_Value(Z))))
444 return isSplatValue(X, Index, Depth) && isSplatValue(Y, Index, Depth) &&
445 isSplatValue(Z, Index, Depth);
446
447 // TODO: Add support for unary ops (fneg), casts, intrinsics (overflow ops).
448
449 return false;
450}
451
452bool llvm::getShuffleDemandedElts(int SrcWidth, ArrayRef<int> Mask,
453 const APInt &DemandedElts, APInt &DemandedLHS,
454 APInt &DemandedRHS, bool AllowUndefElts) {
455 DemandedLHS = DemandedRHS = APInt::getZero(SrcWidth);
456
457 // Early out if we don't demand any elements.
458 if (DemandedElts.isZero())
459 return true;
460
461 // Simple case of a shuffle with zeroinitializer.
462 if (all_of(Mask, [](int Elt) { return Elt == 0; })) {
463 DemandedLHS.setBit(0);
464 return true;
465 }
466
467 for (unsigned I = 0, E = Mask.size(); I != E; ++I) {
468 int M = Mask[I];
469 assert((-1 <= M) && (M < (SrcWidth * 2)) &&
470 "Invalid shuffle mask constant");
471
472 if (!DemandedElts[I] || (AllowUndefElts && (M < 0)))
473 continue;
474
475 // For undef elements, we don't know anything about the common state of
476 // the shuffle result.
477 if (M < 0)
478 return false;
479
480 if (M < SrcWidth)
481 DemandedLHS.setBit(M);
482 else
483 DemandedRHS.setBit(M - SrcWidth);
484 }
485
486 return true;
487}
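// Illustrative example (not part of the original source): with SrcWidth = 4,
// Mask = {0, 5, 1, 7} and DemandedElts = 0b1111, the function returns true with
// DemandedLHS = 0b0011 (elements 0 and 1 of the first source) and
// DemandedRHS = 0b1010 (elements 1 and 3 of the second source).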
488
489bool llvm::isMaskedSlidePair(ArrayRef<int> Mask, int NumElts,
490 std::array<std::pair<int, int>, 2> &SrcInfo) {
491 const int SignalValue = NumElts * 2;
492 SrcInfo[0] = {-1, SignalValue};
493 SrcInfo[1] = {-1, SignalValue};
494 for (auto [i, M] : enumerate(Mask)) {
495 if (M < 0)
496 continue;
497 int Src = M >= NumElts;
498 int Diff = (int)i - (M % NumElts);
499 bool Match = false;
500 for (int j = 0; j < 2; j++) {
501 auto &[SrcE, DiffE] = SrcInfo[j];
502 if (SrcE == -1) {
503 assert(DiffE == SignalValue);
504 SrcE = Src;
505 DiffE = Diff;
506 }
507 if (SrcE == Src && DiffE == Diff) {
508 Match = true;
509 break;
510 }
511 }
512 if (!Match)
513 return false;
514 }
515 // Avoid all undef masks
516 return SrcInfo[0].first != -1;
517}
518
519void llvm::narrowShuffleMaskElts(int Scale, ArrayRef<int> Mask,
520 SmallVectorImpl<int> &ScaledMask) {
521 assert(Scale > 0 && "Unexpected scaling factor");
522
523 // Fast-path: if no scaling, then it is just a copy.
524 if (Scale == 1) {
525 ScaledMask.assign(Mask.begin(), Mask.end());
526 return;
527 }
528
529 ScaledMask.clear();
530 for (int MaskElt : Mask) {
531 if (MaskElt >= 0) {
532 assert(((uint64_t)Scale * MaskElt + (Scale - 1)) <= INT32_MAX &&
533 "Overflowed 32-bits");
534 }
535 for (int SliceElt = 0; SliceElt != Scale; ++SliceElt)
536 ScaledMask.push_back(MaskElt < 0 ? MaskElt : Scale * MaskElt + SliceElt);
537 }
538}
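// Illustrative example (not part of the original source): narrowing by
// Scale = 2 turns Mask = {1, -1} into ScaledMask = {2, 3, -1, -1}; each mask
// element expands into Scale consecutive elements of the narrower type.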
539
540bool llvm::widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
541 SmallVectorImpl<int> &ScaledMask) {
542 assert(Scale > 0 && "Unexpected scaling factor");
543
544 // Fast-path: if no scaling, then it is just a copy.
545 if (Scale == 1) {
546 ScaledMask.assign(Mask.begin(), Mask.end());
547 return true;
548 }
549
550 // We must map the original elements down evenly to a type with less elements.
551 int NumElts = Mask.size();
552 if (NumElts % Scale != 0)
553 return false;
554
555 ScaledMask.clear();
556 ScaledMask.reserve(NumElts / Scale);
557
558 // Step through the input mask by splitting into Scale-sized slices.
559 do {
560 ArrayRef<int> MaskSlice = Mask.take_front(Scale);
561 assert((int)MaskSlice.size() == Scale && "Expected Scale-sized slice.");
562
563 // The first element of the slice determines how we evaluate this slice.
564 int SliceFront = MaskSlice.front();
565 if (SliceFront < 0) {
566 // Negative values (undef or other "sentinel" values) must be equal across
567 // the entire slice.
568 if (!all_equal(MaskSlice))
569 return false;
570 ScaledMask.push_back(SliceFront);
571 } else {
572 // A positive mask element must be cleanly divisible.
573 if (SliceFront % Scale != 0)
574 return false;
575 // Elements of the slice must be consecutive.
576 for (int i = 1; i < Scale; ++i)
577 if (MaskSlice[i] != SliceFront + i)
578 return false;
579 ScaledMask.push_back(SliceFront / Scale);
580 }
581 Mask = Mask.drop_front(Scale);
582 } while (!Mask.empty());
583
584 assert((int)ScaledMask.size() * Scale == NumElts && "Unexpected scaled mask");
585
586 // All elements of the original mask can be scaled down to map to the elements
587 // of a mask with wider elements.
588 return true;
589}
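// Illustrative example (not part of the original source): widening by
// Scale = 2 turns Mask = {2, 3, -1, -1} into ScaledMask = {1, -1} and returns
// true, but returns false for Mask = {0, 2, 4, 6} because each slice of Scale
// elements is not consecutive.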
590
591bool llvm::widenShuffleMaskElts(ArrayRef<int> M,
592 SmallVectorImpl<int> &NewMask) {
593 unsigned NumElts = M.size();
594 if (NumElts % 2 != 0)
595 return false;
596
597 NewMask.clear();
598 for (unsigned i = 0; i < NumElts; i += 2) {
599 int M0 = M[i];
600 int M1 = M[i + 1];
601
602 // If both elements are undef, new mask is undef too.
603 if (M0 == -1 && M1 == -1) {
604 NewMask.push_back(-1);
605 continue;
606 }
607
608 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
609 NewMask.push_back(M1 / 2);
610 continue;
611 }
612
613 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
614 NewMask.push_back(M0 / 2);
615 continue;
616 }
617
618 NewMask.clear();
619 return false;
620 }
621
622 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
623 return true;
624}
625
626bool llvm::scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef<int> Mask,
627 SmallVectorImpl<int> &ScaledMask) {
628 unsigned NumSrcElts = Mask.size();
629 assert(NumSrcElts > 0 && NumDstElts > 0 && "Unexpected scaling factor");
630
631 // Fast-path: if no scaling, then it is just a copy.
632 if (NumSrcElts == NumDstElts) {
633 ScaledMask.assign(Mask.begin(), Mask.end());
634 return true;
635 }
636
637 // Ensure we can find a whole scale factor.
638 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
639 "Unexpected scaling factor");
640
641 if (NumSrcElts > NumDstElts) {
642 int Scale = NumSrcElts / NumDstElts;
643 return widenShuffleMaskElts(Scale, Mask, ScaledMask);
644 }
645
646 int Scale = NumDstElts / NumSrcElts;
647 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
648 return true;
649}
650
651void llvm::getShuffleMaskWithWidestElts(ArrayRef<int> Mask,
652 SmallVectorImpl<int> &ScaledMask) {
653 std::array<SmallVector<int, 16>, 2> TmpMasks;
654 SmallVectorImpl<int> *Output = &TmpMasks[0], *Tmp = &TmpMasks[1];
655 ArrayRef<int> InputMask = Mask;
656 for (unsigned Scale = 2; Scale <= InputMask.size(); ++Scale) {
657 while (widenShuffleMaskElts(Scale, InputMask, *Output)) {
658 InputMask = *Output;
659 std::swap(Output, Tmp);
660 }
661 }
662 ScaledMask.assign(InputMask.begin(), InputMask.end());
663}
664
665void llvm::processShuffleMasks(
666 ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
667 unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
668 function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
669 function_ref<void(ArrayRef<int>, unsigned, unsigned, bool)>
670 ManyInputsAction) {
671 SmallVector<SmallVector<SmallVector<int>>> Res(NumOfDestRegs);
672 // Try to perform better estimation of the permutation.
673 // 1. Split the source/destination vectors into real registers.
674 // 2. Do the mask analysis to identify which real registers are
675 // permuted.
676 int Sz = Mask.size();
677 unsigned SzDest = Sz / NumOfDestRegs;
678 unsigned SzSrc = Sz / NumOfSrcRegs;
679 for (unsigned I = 0; I < NumOfDestRegs; ++I) {
680 auto &RegMasks = Res[I];
681 RegMasks.assign(2 * NumOfSrcRegs, {});
682 // Check that the values in dest registers are in the one src
683 // register.
684 for (unsigned K = 0; K < SzDest; ++K) {
685 int Idx = I * SzDest + K;
686 if (Idx == Sz)
687 break;
688 if (Mask[Idx] >= 2 * Sz || Mask[Idx] == PoisonMaskElem)
689 continue;
690 int MaskIdx = Mask[Idx] % Sz;
691 int SrcRegIdx = MaskIdx / SzSrc + (Mask[Idx] >= Sz ? NumOfSrcRegs : 0);
692 // Add a cost of PermuteTwoSrc for each new source register permute,
693 // if we have more than one source registers.
694 if (RegMasks[SrcRegIdx].empty())
695 RegMasks[SrcRegIdx].assign(SzDest, PoisonMaskElem);
696 RegMasks[SrcRegIdx][K] = MaskIdx % SzSrc;
697 }
698 }
699 // Process split mask.
700 for (unsigned I : seq<unsigned>(NumOfUsedRegs)) {
701 auto &Dest = Res[I];
702 int NumSrcRegs =
703 count_if(Dest, [](ArrayRef<int> Mask) { return !Mask.empty(); });
704 switch (NumSrcRegs) {
705 case 0:
706 // No input vectors were used!
707 NoInputAction();
708 break;
709 case 1: {
710 // Find the only mask with at least single undef mask elem.
711 auto *It =
712 find_if(Dest, [](ArrayRef<int> Mask) { return !Mask.empty(); });
713 unsigned SrcReg = std::distance(Dest.begin(), It);
714 SingleInputAction(*It, SrcReg, I);
715 break;
716 }
717 default: {
718 // The first mask is a permutation of a single register. Since we have >2
719 // input registers to shuffle, we merge the masks for 2 first registers
720 // and generate a shuffle of 2 registers rather than the reordering of the
721 // first register and then shuffle with the second register. Next,
722 // generate the shuffles of the resulting register + the remaining
723 // registers from the list.
724 auto &&CombineMasks = [](MutableArrayRef<int> FirstMask,
725 ArrayRef<int> SecondMask) {
726 for (int Idx = 0, VF = FirstMask.size(); Idx < VF; ++Idx) {
727 if (SecondMask[Idx] != PoisonMaskElem) {
728 assert(FirstMask[Idx] == PoisonMaskElem &&
729 "Expected undefined mask element.");
730 FirstMask[Idx] = SecondMask[Idx] + VF;
731 }
732 }
733 };
734 auto &&NormalizeMask = [](MutableArrayRef<int> Mask) {
735 for (int Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
736 if (Mask[Idx] != PoisonMaskElem)
737 Mask[Idx] = Idx;
738 }
739 };
740 int SecondIdx;
741 bool NewReg = true;
742 do {
743 int FirstIdx = -1;
744 SecondIdx = -1;
745 MutableArrayRef<int> FirstMask, SecondMask;
746 for (unsigned I : seq<unsigned>(2 * NumOfSrcRegs)) {
747 SmallVectorImpl<int> &RegMask = Dest[I];
748 if (RegMask.empty())
749 continue;
750
751 if (FirstIdx == SecondIdx) {
752 FirstIdx = I;
753 FirstMask = RegMask;
754 continue;
755 }
756 SecondIdx = I;
757 SecondMask = RegMask;
758 CombineMasks(FirstMask, SecondMask);
759 ManyInputsAction(FirstMask, FirstIdx, SecondIdx, NewReg);
760 NewReg = false;
761 NormalizeMask(FirstMask);
762 RegMask.clear();
763 SecondMask = FirstMask;
764 SecondIdx = FirstIdx;
765 }
766 if (FirstIdx != SecondIdx && SecondIdx >= 0) {
767 CombineMasks(SecondMask, FirstMask);
768 ManyInputsAction(SecondMask, SecondIdx, FirstIdx, NewReg);
769 NewReg = false;
770 Dest[FirstIdx].clear();
771 NormalizeMask(SecondMask);
772 }
773 } while (SecondIdx >= 0);
774 break;
775 }
776 }
777 }
778}
779
780void llvm::getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth,
781 const APInt &DemandedElts,
782 APInt &DemandedLHS,
783 APInt &DemandedRHS) {
784 assert(VectorBitWidth >= 128 && "Vectors smaller than 128 bit not supported");
785 int NumLanes = VectorBitWidth / 128;
786 int NumElts = DemandedElts.getBitWidth();
787 int NumEltsPerLane = NumElts / NumLanes;
788 int HalfEltsPerLane = NumEltsPerLane / 2;
789
790 DemandedLHS = APInt::getZero(NumElts);
791 DemandedRHS = APInt::getZero(NumElts);
792
793 // Map DemandedElts to the horizontal operands.
794 for (int Idx = 0; Idx != NumElts; ++Idx) {
795 if (!DemandedElts[Idx])
796 continue;
797 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
798 int LocalIdx = Idx % NumEltsPerLane;
799 if (LocalIdx < HalfEltsPerLane) {
800 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx);
801 } else {
802 LocalIdx -= HalfEltsPerLane;
803 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx);
804 }
805 }
806}
807
808MapVector<Instruction *, uint64_t>
809llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
810 const TargetTransformInfo *TTI) {
811
812 // DemandedBits will give us every value's live-out bits. But we want
813 // to ensure no extra casts would need to be inserted, so every DAG
814 // of connected values must have the same minimum bitwidth.
815 EquivalenceClasses<Value *> ECs;
816 SmallVector<Instruction *, 16> Worklist;
817 SmallPtrSet<Instruction *, 4> Roots;
818 SmallPtrSet<Instruction *, 16> Visited;
819 DenseMap<Value *, uint64_t> DBits;
820 SmallPtrSet<Instruction *, 4> InstructionSet;
821 MapVector<Instruction *, uint64_t> MinBWs;
822
823 // Determine the roots. We work bottom-up, from truncs or icmps.
824 bool SeenExtFromIllegalType = false;
825 for (auto *BB : Blocks)
826 for (auto &I : *BB) {
827 InstructionSet.insert(&I);
828
829 if (TTI && (isa<ZExtInst>(&I) || isa<SExtInst>(&I)) &&
830 !TTI->isTypeLegal(I.getOperand(0)->getType()))
831 SeenExtFromIllegalType = true;
832
833 // Only deal with non-vector integers up to 64-bits wide.
834 if ((isa<TruncInst>(&I) || isa<ICmpInst>(&I)) &&
835 !I.getType()->isVectorTy() &&
836 I.getOperand(0)->getType()->getScalarSizeInBits() <= 64) {
837 // Don't make work for ourselves. If we know the loaded type is legal,
838 // don't add it to the worklist.
839 if (TTI && isa<TruncInst>(&I) && TTI->isTypeLegal(I.getType()))
840 continue;
841
842 Worklist.push_back(&I);
843 Roots.insert(&I);
844 }
845 }
846 // Early exit.
847 if (Worklist.empty() || (TTI && !SeenExtFromIllegalType))
848 return MinBWs;
849
850 // Now proceed breadth-first, unioning values together.
851 while (!Worklist.empty()) {
852 Instruction *I = Worklist.pop_back_val();
853 Value *Leader = ECs.getOrInsertLeaderValue(I);
854
855 if (!Visited.insert(I).second)
856 continue;
857
858 // If we encounter a type that is larger than 64 bits, we can't represent
859 // it so bail out.
860 if (DB.getDemandedBits(I).getBitWidth() > 64)
861 return MapVector<Instruction *, uint64_t>();
862
863 uint64_t V = DB.getDemandedBits(I).getZExtValue();
864 DBits[Leader] |= V;
865 DBits[I] = V;
866
867 // Casts, loads and instructions outside of our range terminate a chain
868 // successfully.
869 if (isa<SExtInst>(I) || isa<ZExtInst>(I) || isa<LoadInst>(I) ||
870 !InstructionSet.count(I))
871 continue;
872
873 // Unsafe casts terminate a chain unsuccessfully. We can't do anything
874 // useful with bitcasts, ptrtoints or inttoptrs and it'd be unsafe to
875 // transform anything that relies on them.
876 if (isa<BitCastInst>(I) || isa<PtrToIntInst>(I) || isa<IntToPtrInst>(I) ||
877 !I->getType()->isIntegerTy()) {
878 DBits[Leader] |= ~0ULL;
879 continue;
880 }
881
882 // We don't modify the types of PHIs. Reductions will already have been
883 // truncated if possible, and inductions' sizes will have been chosen by
884 // indvars.
885 if (isa<PHINode>(I))
886 continue;
887
888 // Don't modify the types of operands of a call, as doing that would cause a
889 // signature mismatch.
890 if (isa<CallBase>(I))
891 continue;
892
893 if (DBits[Leader] == ~0ULL)
894 // All bits demanded, no point continuing.
895 continue;
896
897 for (Value *O : I->operands()) {
898 ECs.unionSets(Leader, O);
899 if (auto *OI = dyn_cast<Instruction>(O))
900 Worklist.push_back(OI);
901 }
902 }
903
904 // Now we've discovered all values, walk them to see if there are
905 // any users we didn't see. If there are, we can't optimize that
906 // chain.
907 for (auto &I : DBits)
908 for (auto *U : I.first->users())
909 if (U->getType()->isIntegerTy() && DBits.count(U) == 0)
910 DBits[ECs.getOrInsertLeaderValue(I.first)] |= ~0ULL;
911
912 for (const auto &E : ECs) {
913 if (!E->isLeader())
914 continue;
915 uint64_t LeaderDemandedBits = 0;
916 for (Value *M : ECs.members(*E))
917 LeaderDemandedBits |= DBits[M];
918
919 uint64_t MinBW = llvm::bit_width(LeaderDemandedBits);
920 // Round up to a power of 2
921 MinBW = llvm::bit_ceil(MinBW);
922
923 // We don't modify the types of PHIs. Reductions will already have been
924 // truncated if possible, and inductions' sizes will have been chosen by
925 // indvars.
926 // If we are required to shrink a PHI, abandon this entire equivalence class.
927 bool Abort = false;
928 for (Value *M : ECs.members(*E))
929 if (isa<PHINode>(M) && MinBW < M->getType()->getScalarSizeInBits()) {
930 Abort = true;
931 break;
932 }
933 if (Abort)
934 continue;
935
936 for (Value *M : ECs.members(*E)) {
937 auto *MI = dyn_cast<Instruction>(M);
938 if (!MI)
939 continue;
940 Type *Ty = M->getType();
941 if (Roots.count(MI))
942 Ty = MI->getOperand(0)->getType();
943
944 if (MinBW >= Ty->getScalarSizeInBits())
945 continue;
946
947 // If any of M's operands demand more bits than MinBW then M cannot be
948 // performed safely in MinBW.
949 auto *Call = dyn_cast<CallBase>(MI);
950 auto Ops = Call ? Call->args() : MI->operands();
951 if (any_of(Ops, [&DB, MinBW](Use &U) {
952 auto *CI = dyn_cast<ConstantInt>(U);
953 // For constants shift amounts, check if the shift would result in
954 // poison.
955 if (CI &&
956 isa<ShlOperator, LShrOperator, AShrOperator>(U.getUser()) &&
957 U.getOperandNo() == 1)
958 return CI->uge(MinBW);
959 uint64_t BW = bit_width(DB.getDemandedBits(&U).getZExtValue());
960 return bit_ceil(BW) > MinBW;
961 }))
962 continue;
963
964 MinBWs[MI] = MinBW;
965 }
966 }
967
968 return MinBWs;
969}
970
971/// Add all access groups in @p AccGroups to @p List.
972template <typename ListT>
973static void addToAccessGroupList(ListT &List, MDNode *AccGroups) {
974 // Interpret an access group as a list containing itself.
975 if (AccGroups->getNumOperands() == 0) {
976 assert(isValidAsAccessGroup(AccGroups) && "Node must be an access group");
977 List.insert(AccGroups);
978 return;
979 }
980
981 for (const auto &AccGroupListOp : AccGroups->operands()) {
982 auto *Item = cast<MDNode>(AccGroupListOp.get());
983 assert(isValidAsAccessGroup(Item) && "List item must be an access group");
984 List.insert(Item);
985 }
986}
987
988MDNode *llvm::uniteAccessGroups(MDNode *AccGroups1, MDNode *AccGroups2) {
989 if (!AccGroups1)
990 return AccGroups2;
991 if (!AccGroups2)
992 return AccGroups1;
993 if (AccGroups1 == AccGroups2)
994 return AccGroups1;
995
996 SmallSetVector<Metadata *, 4> Union;
997 addToAccessGroupList(Union, AccGroups1);
998 addToAccessGroupList(Union, AccGroups2);
999
1000 if (Union.size() == 0)
1001 return nullptr;
1002 if (Union.size() == 1)
1003 return cast<MDNode>(Union.front());
1004
1005 LLVMContext &Ctx = AccGroups1->getContext();
1006 return MDNode::get(Ctx, Union.getArrayRef());
1007}
1008
1009MDNode *llvm::intersectAccessGroups(const Instruction *Inst1,
1010 const Instruction *Inst2) {
1011 bool MayAccessMem1 = Inst1->mayReadOrWriteMemory();
1012 bool MayAccessMem2 = Inst2->mayReadOrWriteMemory();
1013
1014 if (!MayAccessMem1 && !MayAccessMem2)
1015 return nullptr;
1016 if (!MayAccessMem1)
1017 return Inst2->getMetadata(LLVMContext::MD_access_group);
1018 if (!MayAccessMem2)
1019 return Inst1->getMetadata(LLVMContext::MD_access_group);
1020
1021 MDNode *MD1 = Inst1->getMetadata(LLVMContext::MD_access_group);
1022 MDNode *MD2 = Inst2->getMetadata(LLVMContext::MD_access_group);
1023 if (!MD1 || !MD2)
1024 return nullptr;
1025 if (MD1 == MD2)
1026 return MD1;
1027
1028 // Use set for scalable 'contains' check.
1029 SmallPtrSet<Metadata *, 4> AccGroupSet2;
1030 addToAccessGroupList(AccGroupSet2, MD2);
1031
1032 SmallVector<Metadata *, 4> Intersection;
1033 if (MD1->getNumOperands() == 0) {
1034 assert(isValidAsAccessGroup(MD1) && "Node must be an access group");
1035 if (AccGroupSet2.count(MD1))
1036 Intersection.push_back(MD1);
1037 } else {
1038 for (const MDOperand &Node : MD1->operands()) {
1039 auto *Item = cast<MDNode>(Node.get());
1040 assert(isValidAsAccessGroup(Item) && "List item must be an access group");
1041 if (AccGroupSet2.count(Item))
1042 Intersection.push_back(Item);
1043 }
1044 }
1045
1046 if (Intersection.size() == 0)
1047 return nullptr;
1048 if (Intersection.size() == 1)
1049 return cast<MDNode>(Intersection.front());
1050
1051 LLVMContext &Ctx = Inst1->getContext();
1052 return MDNode::get(Ctx, Intersection);
1053}
1054
1055/// Add metadata from \p Inst to \p Metadata, if it can be preserved after
1056/// vectorization.
1057void llvm::getMetadataToPropagate(
1058 Instruction *Inst,
1059 SmallVectorImpl<std::pair<unsigned, MDNode *>> &Metadata) {
1060 Inst->getAllMetadataOtherThanDebugLoc(Metadata);
1061 static const unsigned SupportedIDs[] = {
1062 LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
1063 LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
1064 LLVMContext::MD_nontemporal, LLVMContext::MD_invariant_load,
1065 LLVMContext::MD_access_group, LLVMContext::MD_mmra};
1066
1067 // Remove any unsupported metadata kinds from Metadata.
1068 for (unsigned Idx = 0; Idx != Metadata.size();) {
1069 if (is_contained(SupportedIDs, Metadata[Idx].first)) {
1070 ++Idx;
1071 } else {
1072 // Swap element to end and remove it.
1073 std::swap(Metadata[Idx], Metadata.back());
1074 Metadata.pop_back();
1075 }
1076 }
1077}
1078
1079/// \returns \p I after propagating metadata from \p VL.
1080Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
1081 if (VL.empty())
1082 return Inst;
1083 SmallVector<std::pair<unsigned, MDNode *>> Metadata;
1084 getMetadataToPropagate(cast<Instruction>(VL[0]), Metadata);
1085
1086 for (auto &[Kind, MD] : Metadata) {
1087 // Skip MMRA metadata if the instruction cannot have it.
1088 if (Kind == LLVMContext::MD_mmra && !canInstructionHaveMMRAs(*Inst))
1089 continue;
1090
1091 for (int J = 1, E = VL.size(); MD && J != E; ++J) {
1092 const Instruction *IJ = cast<Instruction>(VL[J]);
1093 MDNode *IMD = IJ->getMetadata(Kind);
1094
1095 switch (Kind) {
1096 case LLVMContext::MD_mmra: {
1097 MD = MMRAMetadata::combine(Inst->getContext(), MD, IMD);
1098 break;
1099 }
1100 case LLVMContext::MD_tbaa:
1101 MD = MDNode::getMostGenericTBAA(MD, IMD);
1102 break;
1103 case LLVMContext::MD_alias_scope:
1104 MD = MDNode::getMostGenericAliasScope(MD, IMD);
1105 break;
1106 case LLVMContext::MD_fpmath:
1107 MD = MDNode::getMostGenericFPMath(MD, IMD);
1108 break;
1109 case LLVMContext::MD_noalias:
1110 case LLVMContext::MD_nontemporal:
1111 case LLVMContext::MD_invariant_load:
1112 MD = MDNode::intersect(MD, IMD);
1113 break;
1114 case LLVMContext::MD_access_group:
1115 MD = intersectAccessGroups(Inst, IJ);
1116 break;
1117 default:
1118 llvm_unreachable("unhandled metadata");
1119 }
1120 }
1121
1122 Inst->setMetadata(Kind, MD);
1123 }
1124
1125 return Inst;
1126}
1127
1128Constant *
1129llvm::createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF,
1130 const InterleaveGroup<Instruction> &Group) {
1131 // All 1's means mask is not needed.
1132 if (Group.isFull())
1133 return nullptr;
1134
1135 // TODO: support reversed access.
1136 assert(!Group.isReverse() && "Reversed group not supported.");
1137
1138 SmallVector<Constant *, 16> Mask;
1139 for (unsigned i = 0; i < VF; i++)
1140 for (unsigned j = 0; j < Group.getFactor(); ++j) {
1141 unsigned HasMember = Group.getMember(j) ? 1 : 0;
1142 Mask.push_back(Builder.getInt1(HasMember));
1143 }
1144
1145 return ConstantVector::get(Mask);
1146}
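// Illustrative example (not part of the original source): for a group with
// factor 2 where only member 0 is present and VF = 2, the result is the i1
// constant vector <1, 0, 1, 0>, i.e. only the lanes of the existing member
// are enabled.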
1147
1148llvm::SmallVector<int, 16>
1149llvm::createReplicatedMask(unsigned ReplicationFactor, unsigned VF) {
1150 SmallVector<int, 16> MaskVec;
1151 for (unsigned i = 0; i < VF; i++)
1152 for (unsigned j = 0; j < ReplicationFactor; j++)
1153 MaskVec.push_back(i);
1154
1155 return MaskVec;
1156}
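// Illustrative example (not part of the original source):
//   createReplicatedMask(/*ReplicationFactor=*/3, /*VF=*/2) == {0, 0, 0, 1, 1, 1}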
1157
1158llvm::SmallVector<int, 16> llvm::createInterleaveMask(unsigned VF,
1159 unsigned NumVecs) {
1160 SmallVector<int, 16> Mask;
1161 for (unsigned i = 0; i < VF; i++)
1162 for (unsigned j = 0; j < NumVecs; j++)
1163 Mask.push_back(j * VF + i);
1164
1165 return Mask;
1166}
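// Illustrative example (not part of the original source):
//   createInterleaveMask(/*VF=*/4, /*NumVecs=*/2) == {0, 4, 1, 5, 2, 6, 3, 7}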
1167
1168llvm::SmallVector<int, 16>
1169llvm::createStrideMask(unsigned Start, unsigned Stride, unsigned VF) {
1170 SmallVector<int, 16> Mask;
1171 for (unsigned i = 0; i < VF; i++)
1172 Mask.push_back(Start + i * Stride);
1173
1174 return Mask;
1175}
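// Illustrative example (not part of the original source):
//   createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4) == {0, 2, 4, 6}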
1176
1177llvm::SmallVector<int, 16> llvm::createSequentialMask(unsigned Start,
1178 unsigned NumInts,
1179 unsigned NumUndefs) {
1180 SmallVector<int, 16> Mask;
1181 for (unsigned i = 0; i < NumInts; i++)
1182 Mask.push_back(Start + i);
1183
1184 for (unsigned i = 0; i < NumUndefs; i++)
1185 Mask.push_back(-1);
1186
1187 return Mask;
1188}
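// Illustrative example (not part of the original source):
//   createSequentialMask(/*Start=*/5, /*NumInts=*/3, /*NumUndefs=*/2)
//     == {5, 6, 7, -1, -1}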
1189
1190llvm::SmallVector<int, 16> llvm::createUnaryMask(ArrayRef<int> Mask,
1191 unsigned NumElts) {
1192 // Avoid casts in the loop and make sure we have a reasonable number.
1193 int NumEltsSigned = NumElts;
1194 assert(NumEltsSigned > 0 && "Expected smaller or non-zero element count");
1195
1196 // If the mask chooses an element from operand 1, reduce it to choose from the
1197 // corresponding element of operand 0. Undef mask elements are unchanged.
1198 SmallVector<int, 16> UnaryMask;
1199 for (int MaskElt : Mask) {
1200 assert((MaskElt < NumEltsSigned * 2) && "Expected valid shuffle mask");
1201 int UnaryElt = MaskElt >= NumEltsSigned ? MaskElt - NumEltsSigned : MaskElt;
1202 UnaryMask.push_back(UnaryElt);
1203 }
1204 return UnaryMask;
1205}
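// Illustrative example (not part of the original source): with NumElts = 4,
//   createUnaryMask({0, 5, 2, 7}, 4) == {0, 1, 2, 3}
// i.e. references to the second operand are folded onto the first operand.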
1206
1207/// A helper function for concatenating vectors. This function concatenates two
1208/// vectors having the same element type. If the second vector has fewer
1209/// elements than the first, it is padded with undefs.
1210static Value *concatenateTwoVectors(IRBuilderBase &Builder, Value *V1,
1211 Value *V2) {
1212 VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
1213 VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
1214 assert(VecTy1 && VecTy2 &&
1215 VecTy1->getScalarType() == VecTy2->getScalarType() &&
1216 "Expect two vectors with the same element type");
1217
1218 unsigned NumElts1 = cast<FixedVectorType>(VecTy1)->getNumElements();
1219 unsigned NumElts2 = cast<FixedVectorType>(VecTy2)->getNumElements();
1220 assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements");
1221
1222 if (NumElts1 > NumElts2) {
1223 // Extend with UNDEFs.
1224 V2 = Builder.CreateShuffleVector(
1225 V2, createSequentialMask(0, NumElts2, NumElts1 - NumElts2));
1226 }
1227
1228 return Builder.CreateShuffleVector(
1229 V1, V2, createSequentialMask(0, NumElts1 + NumElts2, 0));
1230}
1231
1232Value *llvm::concatenateVectors(IRBuilderBase &Builder,
1233 ArrayRef<Value *> Vecs) {
1234 unsigned NumVecs = Vecs.size();
1235 assert(NumVecs > 1 && "Should be at least two vectors");
1236
1237 SmallVector<Value *, 8> ResList;
1238 ResList.append(Vecs.begin(), Vecs.end());
1239 do {
1240 SmallVector<Value *, 8> TmpList;
1241 for (unsigned i = 0; i < NumVecs - 1; i += 2) {
1242 Value *V0 = ResList[i], *V1 = ResList[i + 1];
1243 assert((V0->getType() == V1->getType() || i == NumVecs - 2) &&
1244 "Only the last vector may have a different type");
1245
1246 TmpList.push_back(concatenateTwoVectors(Builder, V0, V1));
1247 }
1248
1249 // Push the last vector if the total number of vectors is odd.
1250 if (NumVecs % 2 != 0)
1251 TmpList.push_back(ResList[NumVecs - 1]);
1252
1253 ResList = TmpList;
1254 NumVecs = ResList.size();
1255 } while (NumVecs > 1);
1256
1257 return ResList[0];
1258}
1259
1260bool llvm::maskIsAllZeroOrUndef(Value *Mask) {
1261 assert(isa<VectorType>(Mask->getType()) &&
1262 isa<IntegerType>(Mask->getType()->getScalarType()) &&
1263 cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
1264 1 &&
1265 "Mask must be a vector of i1");
1266
1267 auto *ConstMask = dyn_cast<Constant>(Mask);
1268 if (!ConstMask)
1269 return false;
1270 if (ConstMask->isNullValue() || isa<UndefValue>(ConstMask))
1271 return true;
1272 if (isa<ScalableVectorType>(ConstMask->getType()))
1273 return false;
1274 for (unsigned
1275 I = 0,
1276 E = cast<FixedVectorType>(ConstMask->getType())->getNumElements();
1277 I != E; ++I) {
1278 if (auto *MaskElt = ConstMask->getAggregateElement(I))
1279 if (MaskElt->isNullValue() || isa<UndefValue>(MaskElt))
1280 continue;
1281 return false;
1282 }
1283 return true;
1284}
1285
1286bool llvm::maskIsAllOneOrUndef(Value *Mask) {
1287 assert(isa<VectorType>(Mask->getType()) &&
1288 isa<IntegerType>(Mask->getType()->getScalarType()) &&
1289 cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
1290 1 &&
1291 "Mask must be a vector of i1");
1292
1293 auto *ConstMask = dyn_cast<Constant>(Mask);
1294 if (!ConstMask)
1295 return false;
1296 if (ConstMask->isAllOnesValue() || isa<UndefValue>(ConstMask))
1297 return true;
1298 if (isa<ScalableVectorType>(ConstMask->getType()))
1299 return false;
1300 for (unsigned
1301 I = 0,
1302 E = cast<FixedVectorType>(ConstMask->getType())->getNumElements();
1303 I != E; ++I) {
1304 if (auto *MaskElt = ConstMask->getAggregateElement(I))
1305 if (MaskElt->isAllOnesValue() || isa<UndefValue>(MaskElt))
1306 continue;
1307 return false;
1308 }
1309 return true;
1310}
1311
1312bool llvm::maskContainsAllOneOrUndef(Value *Mask) {
1313 assert(isa<VectorType>(Mask->getType()) &&
1314 isa<IntegerType>(Mask->getType()->getScalarType()) &&
1315 cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
1316 1 &&
1317 "Mask must be a vector of i1");
1318
1319 auto *ConstMask = dyn_cast<Constant>(Mask);
1320 if (!ConstMask)
1321 return false;
1322 if (ConstMask->isAllOnesValue() || isa<UndefValue>(ConstMask))
1323 return true;
1324 if (isa<ScalableVectorType>(ConstMask->getType()))
1325 return false;
1326 for (unsigned
1327 I = 0,
1328 E = cast<FixedVectorType>(ConstMask->getType())->getNumElements();
1329 I != E; ++I) {
1330 if (auto *MaskElt = ConstMask->getAggregateElement(I))
1331 if (MaskElt->isAllOnesValue() || isa<UndefValue>(MaskElt))
1332 return true;
1333 }
1334 return false;
1335}
1336
1337/// TODO: This is a lot like known bits, but for
1338/// vectors. Is there something we can common this with?
1339APInt llvm::possiblyDemandedEltsInMask(Value *Mask) {
1340 assert(isa<FixedVectorType>(Mask->getType()) &&
1341 isa<IntegerType>(Mask->getType()->getScalarType()) &&
1342 cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
1343 1 &&
1344 "Mask must be a fixed width vector of i1");
1345
1346 const unsigned VWidth =
1347 cast<FixedVectorType>(Mask->getType())->getNumElements();
1348 APInt DemandedElts = APInt::getAllOnes(VWidth);
1349 if (auto *CV = dyn_cast<ConstantVector>(Mask))
1350 for (unsigned i = 0; i < VWidth; i++)
1351 if (CV->getAggregateElement(i)->isNullValue())
1352 DemandedElts.clearBit(i);
1353 return DemandedElts;
1354}
1355
1356bool InterleavedAccessInfo::isStrided(int Stride) {
1357 unsigned Factor = std::abs(Stride);
1358 return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
1359}
1360
1361void InterleavedAccessInfo::collectConstStrideAccesses(
1362 MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
1363 const DenseMap<Value*, const SCEV*> &Strides) {
1364 auto &DL = TheLoop->getHeader()->getDataLayout();
1365
1366 // Since it's desired that the load/store instructions be maintained in
1367 // "program order" for the interleaved access analysis, we have to visit the
1368 // blocks in the loop in reverse postorder (i.e., in a topological order).
1369 // Such an ordering will ensure that any load/store that may be executed
1370 // before a second load/store will precede the second load/store in
1371 // AccessStrideInfo.
1372 LoopBlocksDFS DFS(TheLoop);
1373 DFS.perform(LI);
1374 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
1375 for (auto &I : *BB) {
1376 Value *Ptr = getLoadStorePointerOperand(&I);
1377 if (!Ptr)
1378 continue;
1379 Type *ElementTy = getLoadStoreType(&I);
1380
1381 // Currently, codegen doesn't support cases where the type size doesn't
1382 // match the alloc size. Skip them for now.
1383 uint64_t Size = DL.getTypeAllocSize(ElementTy);
1384 if (Size * 8 != DL.getTypeSizeInBits(ElementTy))
1385 continue;
1386
1387 // We don't check wrapping here because we don't know yet if Ptr will be
1388 // part of a full group or a group with gaps. Checking wrapping for all
1389 // pointers (even those that end up in groups with no gaps) will be overly
1390 // conservative. For full groups, wrapping should be ok since if we would
1391 // wrap around the address space we would do a memory access at nullptr
1392 // even without the transformation. The wrapping checks are therefore
1393 // deferred until after we've formed the interleaved groups.
1394 int64_t Stride = getPtrStride(PSE, ElementTy, Ptr, TheLoop, *DT, Strides,
1395 /*Assume=*/true, /*ShouldCheckWrap=*/false)
1396 .value_or(0);
1397
1398 const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
1399 AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size,
1400 getLoadStoreAlignment(&I));
1401 }
1402}
1403
1404// Analyze interleaved accesses and collect them into interleaved load and
1405// store groups.
1406//
1407// When generating code for an interleaved load group, we effectively hoist all
1408// loads in the group to the location of the first load in program order. When
1409// generating code for an interleaved store group, we sink all stores to the
1410// location of the last store. This code motion can change the order of load
1411// and store instructions and may break dependences.
1412//
1413// The code generation strategy mentioned above ensures that we won't violate
1414// any write-after-read (WAR) dependences.
1415//
1416// E.g., for the WAR dependence: a = A[i]; // (1)
1417// A[i] = b; // (2)
1418//
1419// The store group of (2) is always inserted at or below (2), and the load
1420// group of (1) is always inserted at or above (1). Thus, the instructions will
1421// never be reordered. All other dependences are checked to ensure the
1422// correctness of the instruction reordering.
1423//
1424// The algorithm visits all memory accesses in the loop in bottom-up program
1425// order. Program order is established by traversing the blocks in the loop in
1426// reverse postorder when collecting the accesses.
1427//
1428// We visit the memory accesses in bottom-up order because it can simplify the
1429// construction of store groups in the presence of write-after-write (WAW)
1430// dependences.
1431//
1432// E.g., for the WAW dependence: A[i] = a; // (1)
1433// A[i] = b; // (2)
1434// A[i + 1] = c; // (3)
1435//
1436// We will first create a store group with (3) and (2). (1) can't be added to
1437// this group because it and (2) are dependent. However, (1) can be grouped
1438// with other accesses that may precede it in program order. Note that a
1439// bottom-up order does not imply that WAW dependences should not be checked.
1440void InterleavedAccessInfo::analyzeInterleaving(
1441 bool EnablePredicatedInterleavedMemAccesses) {
1442 LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
1443 const auto &Strides = LAI->getSymbolicStrides();
1444
1445 // Holds all accesses with a constant stride.
1446 MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
1447 collectConstStrideAccesses(AccessStrideInfo, Strides);
1448
1449 if (AccessStrideInfo.empty())
1450 return;
1451
1452 // Collect the dependences in the loop.
1453 collectDependences();
1454
1455 // Holds all interleaved store groups temporarily.
1456 SmallSetVector<InterleaveGroup<Instruction> *, 4> StoreGroups;
1457 // Holds all interleaved load groups temporarily.
1458 SmallSetVector<InterleaveGroup<Instruction> *, 4> LoadGroups;
1459 // Groups added to this set cannot have new members added.
1460 SmallPtrSet<InterleaveGroup<Instruction> *, 4> CompletedLoadGroups;
1461
1462 // Search in bottom-up program order for pairs of accesses (A and B) that can
1463 // form interleaved load or store groups. In the algorithm below, access A
1464 // precedes access B in program order. We initialize a group for B in the
1465 // outer loop of the algorithm, and then in the inner loop, we attempt to
1466 // insert each A into B's group if:
1467 //
1468 // 1. A and B have the same stride,
1469 // 2. A and B have the same memory object size, and
1470 // 3. A belongs in B's group according to its distance from B.
1471 //
1472 // Special care is taken to ensure group formation will not break any
1473 // dependences.
1474 for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
1475 BI != E; ++BI) {
1476 Instruction *B = BI->first;
1477 StrideDescriptor DesB = BI->second;
1478
1479 // Initialize a group for B if it has an allowable stride. Even if we don't
1480 // create a group for B, we continue with the bottom-up algorithm to ensure
1481 // we don't break any of B's dependences.
1482 InterleaveGroup<Instruction> *GroupB = nullptr;
1483 if (isStrided(DesB.Stride) &&
1484 (!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) {
1485 GroupB = getInterleaveGroup(B);
1486 if (!GroupB) {
1487 LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
1488 << '\n');
1489 GroupB = createInterleaveGroup(B, DesB.Stride, DesB.Alignment);
1490 if (B->mayWriteToMemory())
1491 StoreGroups.insert(GroupB);
1492 else
1493 LoadGroups.insert(GroupB);
1494 }
1495 }
1496
1497 for (auto AI = std::next(BI); AI != E; ++AI) {
1498 Instruction *A = AI->first;
1499 StrideDescriptor DesA = AI->second;
1500
1501 // Our code motion strategy implies that we can't have dependences
1502 // between accesses in an interleaved group and other accesses located
1503 // between the first and last member of the group. Note that this also
1504 // means that a group can't have more than one member at a given offset.
1505 // The accesses in a group can have dependences with other accesses, but
1506 // we must ensure we don't extend the boundaries of the group such that
1507 // we encompass those dependent accesses.
1508 //
1509 // For example, assume we have the sequence of accesses shown below in a
1510 // stride-2 loop:
1511 //
1512 // (1, 2) is a group | A[i] = a; // (1)
1513 // | A[i-1] = b; // (2) |
1514 // A[i-3] = c; // (3)
1515 // A[i] = d; // (4) | (2, 4) is not a group
1516 //
1517 // Because accesses (2) and (3) are dependent, we can group (2) with (1)
1518 // but not with (4). If we did, the dependent access (3) would be within
1519 // the boundaries of the (2, 4) group.
1520 auto DependentMember = [&](InterleaveGroup<Instruction> *Group,
1521 StrideEntry *A) -> Instruction * {
1522 for (uint32_t Index = 0; Index < Group->getFactor(); ++Index) {
1523 Instruction *MemberOfGroupB = Group->getMember(Index);
1524 if (MemberOfGroupB && !canReorderMemAccessesForInterleavedGroups(
1525 A, &*AccessStrideInfo.find(MemberOfGroupB)))
1526 return MemberOfGroupB;
1527 }
1528 return nullptr;
1529 };
1530
1531 auto GroupA = getInterleaveGroup(A);
1532 // If A is a load, dependencies are tolerable, there's nothing to do here.
1533 // If both A and B belong to the same (store) group, they are independent,
1534 // even if dependencies have not been recorded.
1535 // If both GroupA and GroupB are null, there's nothing to do here.
1536 if (A->mayWriteToMemory() && GroupA != GroupB) {
1537 Instruction *DependentInst = nullptr;
1538 // If GroupB is a load group, we have to compare AI against all
1539 // members of GroupB because if any load within GroupB has a dependency
1540 // on AI, we need to mark GroupB as complete and also release the
1541 // store GroupA (if A belongs to one). The former prevents incorrect
1542 // hoisting of load B above store A while the latter prevents incorrect
1543 // sinking of store A below load B.
1544 if (GroupB && LoadGroups.contains(GroupB))
1545 DependentInst = DependentMember(GroupB, &*AI);
1546 else if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI))
1547 DependentInst = B;
1548
1549 if (DependentInst) {
1550 // A has a store dependence on B (or on some load within GroupB) and
1551 // is part of a store group. Release A's group to prevent illegal
1552 // sinking of A below B. A will then be free to form another group
1553 // with instructions that precede it.
1554 if (GroupA && StoreGroups.contains(GroupA)) {
1555 LLVM_DEBUG(dbgs() << "LV: Invalidated store group due to "
1556 "dependence between "
1557 << *A << " and " << *DependentInst << '\n');
1558 StoreGroups.remove(GroupA);
1559 releaseGroup(GroupA);
1560 }
1561 // If B is a load and part of an interleave group, no earlier loads
1562 // can be added to B's interleave group, because this would mean the
1563 // DependentInst would move across store A. Mark the interleave group
1564 // as complete.
1565 if (GroupB && LoadGroups.contains(GroupB)) {
1566 LLVM_DEBUG(dbgs() << "LV: Marking interleave group for " << *B
1567 << " as complete.\n");
1568 CompletedLoadGroups.insert(GroupB);
1569 }
1570 }
1571 }
1572 if (CompletedLoadGroups.contains(GroupB)) {
1573 // Skip trying to add A to B, continue to look for other conflicting A's
1574 // in groups to be released.
1575 continue;
1576 }
1577
1578 // At this point, we've checked for illegal code motion. If either A or B
1579 // isn't strided, there's nothing left to do.
1580 if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
1581 continue;
1582
1583 // Ignore A if it's already in a group or isn't the same kind of memory
1584 // operation as B.
1585 // Note that mayReadFromMemory() isn't mutually exclusive to
1586 // mayWriteToMemory in the case of atomic loads. We shouldn't see those
1587 // here, canVectorizeMemory() should have returned false - except for the
1588 // case we asked for optimization remarks.
1589 if (isInterleaved(A) ||
1590 (A->mayReadFromMemory() != B->mayReadFromMemory()) ||
1591 (A->mayWriteToMemory() != B->mayWriteToMemory()))
1592 continue;
1593
1594 // Check rules 1 and 2. Ignore A if its stride or size is different from
1595 // that of B.
1596 if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
1597 continue;
1598
1599 // Ignore A if the memory object of A and B don't belong to the same
1600 // address space
1601 if (getLoadStoreAddressSpace(A) != getLoadStoreAddressSpace(B))
1602 continue;
1603
1604 // Calculate the distance from A to B.
1605 const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
1606 PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
1607 if (!DistToB)
1608 continue;
1609 int64_t DistanceToB = DistToB->getAPInt().getSExtValue();
1610
1611 // Check rule 3. Ignore A if its distance to B is not a multiple of the
1612 // size.
1613 if (DistanceToB % static_cast<int64_t>(DesB.Size))
1614 continue;
1615
1616 // All members of a predicated interleave-group must have the same predicate,
1617 // and currently must reside in the same BB.
1618 BasicBlock *BlockA = A->getParent();
1619 BasicBlock *BlockB = B->getParent();
1620 if ((isPredicated(BlockA) || isPredicated(BlockB)) &&
1621 (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB))
1622 continue;
1623
1624 // The index of A is the index of B plus A's distance to B in multiples
1625 // of the size.
1626 int IndexA =
1627 GroupB->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);
1628
1629 // Try to insert A into B's group.
1630 if (GroupB->insertMember(A, IndexA, DesA.Alignment)) {
1631 LLVM_DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
1632 << " into the interleave group with" << *B
1633 << '\n');
1634 InterleaveGroupMap[A] = GroupB;
1635
1636 // Set the first load in program order as the insert position.
1637 if (A->mayReadFromMemory())
1638 GroupB->setInsertPos(A);
1639 }
1640 } // Iteration over A accesses.
1641 } // Iteration over B accesses.
1642
1643 auto InvalidateGroupIfMemberMayWrap = [&](InterleaveGroup<Instruction> *Group,
1644 int Index,
1645 const char *FirstOrLast) -> bool {
1646 Instruction *Member = Group->getMember(Index);
1647 assert(Member && "Group member does not exist");
1648 Value *MemberPtr = getLoadStorePointerOperand(Member);
1649 Type *AccessTy = getLoadStoreType(Member);
1650 if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, *DT, Strides,
1651 /*Assume=*/false, /*ShouldCheckWrap=*/true)
1652 .value_or(0))
1653 return false;
1654 LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
1655 << FirstOrLast
1656 << " group member potentially pointer-wrapping.\n");
1657 releaseGroup(Group);
1658 return true;
1659 };
1660
1661 // Remove interleaved groups with gaps whose memory
1662 // accesses may wrap around. We have to revisit the getPtrStride analysis,
1663 // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
1664 // not check wrapping (see documentation there).
1665 // FORNOW we use Assume=false;
1666 // TODO: Change to Assume=true but making sure we don't exceed the threshold
1667 // of runtime SCEV assumptions checks (thereby potentially failing to
1668 // vectorize altogether).
1669 // Additional optional optimizations:
1670 // TODO: If we are peeling the loop and we know that the first pointer doesn't
1671 // wrap then we can deduce that all pointers in the group don't wrap.
1672 // This means that we can forcefully peel the loop in order to only have to
1673 // check the first pointer for no-wrap. When we'll change to use Assume=true
1674 // we'll only need at most one runtime check per interleaved group.
1675 for (auto *Group : LoadGroups) {
1676 // Case 1: A full group. Can Skip the checks; For full groups, if the wide
1677 // load would wrap around the address space we would do a memory access at
1678 // nullptr even without the transformation.
1679 if (Group->isFull())
1680 continue;
1681
1682 // Case 2: If first and last members of the group don't wrap this implies
1683 // that all the pointers in the group don't wrap.
1684 // So we check only group member 0 (which is always guaranteed to exist),
1685 // and group member Factor - 1; If the latter doesn't exist we rely on
1686 // peeling (if it is a non-reversed access -- see Case 3).
1687 if (InvalidateGroupIfMemberMayWrap(Group, 0, "first"))
1688 continue;
1689 if (Group->getMember(Group->getFactor() - 1))
1690 InvalidateGroupIfMemberMayWrap(Group, Group->getFactor() - 1, "last");
1691 else {
1692 // Case 3: A non-reversed interleaved load group with gaps: We need
1693 // to execute at least one scalar epilogue iteration. This will ensure
1694 // we don't speculatively access memory out-of-bounds. We only need
1695 // to look for a member at index factor - 1, since every group must have
1696 // a member at index zero.
1697 if (Group->isReverse()) {
1698 LLVM_DEBUG(
1699 dbgs() << "LV: Invalidate candidate interleaved group due to "
1700 "a reverse access with gaps.\n");
1701 releaseGroup(Group);
1702 continue;
1703 }
1704 LLVM_DEBUG(
1705 dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
1706 RequiresScalarEpilogue = true;
1707 }
1708 }
1709
1710 for (auto *Group : StoreGroups) {
1711 // Case 1: A full group. Can Skip the checks; For full groups, if the wide
1712 // store would wrap around the address space we would do a memory access at
1713 // nullptr even without the transformation.
1714 if (Group->isFull())
1715 continue;
1716
1717 // Interleave-store-group with gaps is implemented using masked wide store.
1718 // Remove interleaved store groups with gaps if
1719 // masked-interleaved-accesses are not enabled by the target.
1720 if (!EnablePredicatedInterleavedMemAccesses) {
1721 LLVM_DEBUG(
1722 dbgs() << "LV: Invalidate candidate interleaved store group due "
1723 "to gaps.\n");
1724 releaseGroup(Group);
1725 continue;
1726 }
1727
1728 // Case 2: If the first and last members of the group don't wrap, this
1729 // implies that none of the pointers in the group wrap.
1730 // So we check only group member 0 (which is always guaranteed to exist)
1731 // and the last group member. Case 3 (scalar epilogue) is not relevant for
1732 // stores with gaps, which are implemented with a masked store (rather than
1733 // speculative accesses, as in loads).
1734 if (InvalidateGroupIfMemberMayWrap(Group, 0, "first"))
1735 continue;
1736 for (int Index = Group->getFactor() - 1; Index > 0; Index--)
1737 if (Group->getMember(Index)) {
1738 InvalidateGroupIfMemberMayWrap(Group, Index, "last");
1739 break;
1740 }
1741 }
1742}
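The Case 2 checks in the loops above look only at the first and the last member of a group: for a constant stride, wrap-around of Base + I * Step is monotone in the member index I, so if neither endpoint wraps, no interior member can. The following standalone sketch is not part of this file; wrapsAt, groupMayWrap, and the sample values are hypothetical, and a positive byte step is assumed. It only illustrates the endpoint argument with plain 64-bit arithmetic.

#include <cassert>
#include <cstdint>

// Hypothetical helper: would Base + I * Step exceed the 64-bit address space?
// Written without intermediate overflow; assumes Step > 0.
static bool wrapsAt(uint64_t Base, uint64_t Step, uint64_t I) {
  return Step != 0 && I > (UINT64_MAX - Base) / Step;
}

// Checking only member 0 and member Count - 1 is sufficient because wrapsAt
// is monotone in I for a fixed positive Step.
static bool groupMayWrap(uint64_t Base, uint64_t Step, uint64_t Count) {
  return wrapsAt(Base, Step, 0) || wrapsAt(Base, Step, Count - 1);
}

int main() {
  assert(!groupMayWrap(0x1000, 16, 8));         // all 8 members fit
  assert(groupMayWrap(UINT64_MAX - 32, 16, 8)); // the last member would wrap
  return 0;
}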
1743
1744void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
1745 // If no group had triggered the requirement to create an epilogue loop,
1746 // there is nothing to do.
1747 if (!requiresScalarEpilogue())
1748 return;
1749
1750 // Release groups requiring scalar epilogues. Note that this also removes them
1751 // from InterleaveGroups.
1752 bool ReleasedGroup = InterleaveGroups.remove_if([&](auto *Group) {
1753 if (!Group->requiresScalarEpilogue())
1754 return false;
1755 LLVM_DEBUG(
1756 dbgs()
1757 << "LV: Invalidate candidate interleaved group due to gaps that "
1758 "require a scalar epilogue (not allowed under optsize) and cannot "
1759 "be masked (not enabled). \n");
1760 releaseGroupWithoutRemovingFromSet(Group);
1761 return true;
1762 });
1763 assert(ReleasedGroup && "At least one group must be invalidated, as a "
1764 "scalar epilogue was required");
1765 (void)ReleasedGroup;
1766 RequiresScalarEpilogue = false;
1767}
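For context, analyzeInterleaving() and invalidateGroupsRequiringScalarEpilogue() are typically driven by a client such as the loop vectorizer: groups are formed first, and groups whose gaps would need a scalar epilogue are dropped later, once the final decision about masking and code size is known. Below is a minimal sketch of that flow, assuming an already-constructed InterleavedAccessInfo; the helper name buildInterleaveGroups and both boolean parameters are placeholders, not APIs from this file.

#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

// Sketch only: IAI is assumed to be set up for the loop being vectorized.
static void buildInterleaveGroups(InterleavedAccessInfo &IAI,
                                  bool CanMaskInterleavedAccesses,
                                  bool MustAvoidScalarEpilogue) {
  // Form the interleave groups; groups with gaps survive only if they can be
  // masked or guarded by at least one scalar epilogue iteration.
  IAI.analyzeInterleaving(CanMaskInterleavedAccesses);

  // If no scalar epilogue may be emitted (e.g. under optsize) and masking is
  // unavailable, drop every group that would have required one.
  if (MustAvoidScalarEpilogue && IAI.requiresScalarEpilogue())
    IAI.invalidateGroupsRequiringScalarEpilogue();
}

Keeping the invalidation separate from the analysis lets a client form groups optimistically and discard only the epilogue-requiring ones once the masking and code-size constraints are settled.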
1768
1769template <typename InstT>
1770void InterleaveGroup<InstT>::addMetadata(InstT *NewInst) const {
1771 llvm_unreachable("addMetadata can only be used for Instruction");
1772}
1773
1774namespace llvm {
1775template <>
1776void InterleaveGroup<Instruction>::addMetadata(Instruction *NewInst) const {
1777 SmallVector<Value *, 4> VL(make_second_range(Members));
1778 propagateMetadata(NewInst, VL);
1779}
1780} // namespace llvm
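Once the vectorizer has emitted the wide memory operation for a group, the Instruction specialization of addMetadata lets it copy over the metadata kinds that remain valid for every member (TBAA, alias scopes, access groups, and so on) via propagateMetadata. A hedged sketch of such a call site follows; annotateWideAccess, Group, and WideLoad are placeholder names, not identifiers from this file.

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Sketch only: Group was obtained from InterleavedAccessInfo and WideLoad is
// the wide load just created to replace the group's scalar members.
static void annotateWideAccess(const InterleaveGroup<Instruction> &Group,
                               Instruction *WideLoad) {
  // Propagates only the metadata kinds that stay valid for the merged access.
  Group.addMetadata(WideLoad);
}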