//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass optimizes scalar/vector interactions using target cost models. The
// transforms implemented here may not fit in traditional loop-based or SLP
// vectorization passes.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/VectorCombine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
#include <optional>
#include <queue>
#include <set>

#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"

using namespace llvm;
using namespace llvm::PatternMatch;

STATISTIC(NumVecLoad, "Number of vector loads formed");
STATISTIC(NumVecCmp, "Number of vector compares formed");
STATISTIC(NumVecBO, "Number of vector binops formed");
STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
STATISTIC(NumScalarCmp, "Number of scalar compares formed");
STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");

static cl::opt<bool> DisableVectorCombine(
    "disable-vector-combine", cl::init(false), cl::Hidden,
    cl::desc("Disable all vector combine transforms"));

static cl::opt<bool> DisableBinopExtractShuffle(
    "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
    cl::desc("Disable binop extract to shuffle transforms"));

static cl::opt<unsigned> MaxInstrsToScan(
    "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
    cl::desc("Max number of instructions to scan for vector combining."));

static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();

namespace {
class VectorCombine {
public:
  VectorCombine(Function &F, const TargetTransformInfo &TTI,
                const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
                const DataLayout *DL, TTI::TargetCostKind CostKind,
                bool TryEarlyFoldsOnly)
      : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
        DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
        TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}

  bool run();

private:
  Function &F;
  IRBuilder<InstSimplifyFolder> Builder;
  const TargetTransformInfo &TTI;
  const DominatorTree &DT;
  AAResults &AA;
  AssumptionCache &AC;
  const DataLayout *DL;
  TTI::TargetCostKind CostKind;
  const SimplifyQuery SQ;

  /// If true, only perform beneficial early IR transforms. Do not introduce
  /// new vector operations.
  bool TryEarlyFoldsOnly;

  InstructionWorklist Worklist;

  /// Next instruction to iterate. It will be updated when it is erased by
  /// RecursivelyDeleteTriviallyDeadInstructions.
  Instruction *NextInst;

  // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
  // parameter. That should be updated to specific sub-classes because the
  // run loop was changed to dispatch on opcode.
  bool vectorizeLoadInsert(Instruction &I);
  bool widenSubvectorLoad(Instruction &I);
  ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                        ExtractElementInst *Ext1,
                                        unsigned PreferredExtractIndex) const;
  bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
                             const Instruction &I,
                             ExtractElementInst *&ConvertToShuffle,
                             unsigned PreferredExtractIndex);
  Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
  Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
  bool foldExtractExtract(Instruction &I);
  bool foldInsExtFNeg(Instruction &I);
  bool foldInsExtBinop(Instruction &I);
  bool foldInsExtVectorToShuffle(Instruction &I);
  bool foldBitOpOfCastops(Instruction &I);
  bool foldBitOpOfCastConstant(Instruction &I);
  bool foldBitcastShuffle(Instruction &I);
  bool scalarizeOpOrCmp(Instruction &I);
  bool scalarizeVPIntrinsic(Instruction &I);
  bool foldExtractedCmps(Instruction &I);
  bool foldBinopOfReductions(Instruction &I);
  bool foldSingleElementStore(Instruction &I);
  bool scalarizeLoad(Instruction &I);
  bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
  bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
  bool scalarizeExtExtract(Instruction &I);
  bool foldConcatOfBoolMasks(Instruction &I);
  bool foldPermuteOfBinops(Instruction &I);
  bool foldShuffleOfBinops(Instruction &I);
  bool foldShuffleOfSelects(Instruction &I);
  bool foldShuffleOfCastops(Instruction &I);
  bool foldShuffleOfShuffles(Instruction &I);
  bool foldShuffleOfIntrinsics(Instruction &I);
  bool foldShuffleToIdentity(Instruction &I);
  bool foldShuffleFromReductions(Instruction &I);
  bool foldShuffleChainsToReduce(Instruction &I);
  bool foldCastFromReductions(Instruction &I);
  bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
  bool foldInterleaveIntrinsics(Instruction &I);
  bool shrinkType(Instruction &I);
  bool shrinkLoadForShuffles(Instruction &I);
  bool shrinkPhiOfShuffles(Instruction &I);

  void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
    LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
    LLVM_DEBUG(dbgs() << "         With: " << New << '\n');
    Old.replaceAllUsesWith(&New);
    if (auto *NewI = dyn_cast<Instruction>(&New)) {
      New.takeName(&Old);
      Worklist.pushUsersToWorkList(*NewI);
      Worklist.pushValue(NewI);
    }
    if (Erase && isInstructionTriviallyDead(&Old)) {
      eraseInstruction(Old);
    } else {
      Worklist.push(&Old);
    }
  }

  void eraseInstruction(Instruction &I) {
    LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
    SmallVector<Value *> Ops(I.operands());
    Worklist.remove(&I);
    I.eraseFromParent();

    // Push remaining users of the operands and then the operand itself - this
    // allows further folds that were hindered by OneUse limits.
    SmallPtrSet<Value *, 4> Visited;
    for (Value *Op : Ops) {
      if (!Visited.contains(Op)) {
        if (auto *OpI = dyn_cast<Instruction>(Op)) {
          if (RecursivelyDeleteTriviallyDeadInstructions(
                  OpI, nullptr, nullptr, [&](Value *V) {
                    if (auto *I = dyn_cast<Instruction>(V)) {
                      LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
                      Worklist.remove(I);
                      if (I == NextInst)
                        NextInst = NextInst->getNextNode();
                      Visited.insert(I);
                    }
                  }))
            continue;
          Worklist.pushUsersToWorkList(*OpI);
          Worklist.pushValue(OpI);
        }
      }
    }
  }
};
} // namespace

/// Return the source operand of a potentially bitcasted value. If there is no
/// bitcast, return the input value itself.
static Value *peekThroughBitcasts(Value *V) {
  while (auto *BitCast = dyn_cast<BitCastInst>(V))
    V = BitCast->getOperand(0);
  return V;
}

static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
  // Do not widen a load if it is atomic/volatile or under asan/hwasan/memtag/
  // tsan. The widened load may load data from dirty regions or create data
  // races that do not exist in the source.
  if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
      Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
      mustSuppressSpeculation(*Load))
    return false;

  // We are potentially transforming byte-sized (8-bit) memory accesses, so make
  // sure we have all of our type-based constraints in place for this target.
  Type *ScalarTy = Load->getType()->getScalarType();
  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
      ScalarSize % 8 != 0)
    return false;

  return true;
}

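/// Illustrative IR (editor's sketch, assuming a target whose minimum vector
/// register width is 128 bits; not taken from the source or its tests):
///   %s = load float, ptr %p, align 4
///   %r = insertelement <4 x float> undef, float %s, i64 0
/// may become
///   %v = load <4 x float>, ptr %p, align 4
///   %r = shufflevector <4 x float> %v, <4 x float> poison,
///        <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
/// when the wider load is known dereferenceable and the cost model does not
/// rate the vector form as more expensive.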
bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
  // Match insert into fixed vector of scalar value.
  // TODO: Handle non-zero insert index.
  Value *Scalar;
  if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
    return false;

  // Optionally match an extract from another vector.
  Value *X;
  bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
  if (!HasExtract)
    X = Scalar;

  auto *Load = dyn_cast<LoadInst>(X);
  if (!canWidenLoad(Load, TTI))
    return false;

  Type *ScalarTy = Scalar->getType();
  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();

  // Check safety of replacing the scalar load with a larger vector load.
  // We use minimal alignment (maximum flexibility) because we only care about
  // the dereferenceable region. When calculating cost and creating a new op,
  // we may use a larger value based on alignment attributes.
  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");

  unsigned MinVecNumElts = MinVectorSize / ScalarSize;
  auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
  unsigned OffsetEltIndex = 0;
  Align Alignment = Load->getAlign();
  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
                                   &DT)) {
    // It is not safe to load directly from the pointer, but we can still peek
    // through gep offsets and check if it is safe to load from a base address
    // with updated alignment. If it is, we can shuffle the element(s) into
    // place after loading.
    unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
    APInt Offset(OffsetBitWidth, 0);
    SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);

    // We want to shuffle the result down from a high element of a vector, so
    // the offset must be positive.
    if (Offset.isNegative())
      return false;

    // The offset must be a multiple of the scalar element to shuffle cleanly
    // in the element's size.
    uint64_t ScalarSizeInBytes = ScalarSize / 8;
    if (Offset.urem(ScalarSizeInBytes) != 0)
      return false;

    // If we load MinVecNumElts, will our target element still be loaded?
    OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
    if (OffsetEltIndex >= MinVecNumElts)
      return false;

    if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
                                     &DT))
      return false;

    // Update alignment with offset value. Note that the offset could be
    // negated to more accurately represent "(new) SrcPtr - Offset = (old)
    // SrcPtr", but negation does not change the result of the alignment
    // calculation.
    Alignment = commonAlignment(Alignment, Offset.getZExtValue());
  }

  // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
  // Use the greater of the alignment on the load or its source pointer.
  Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
  Type *LoadTy = Load->getType();
  unsigned AS = Load->getPointerAddressSpace();
  InstructionCost OldCost =
      TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
  OldCost +=
      TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
                                   /* Insert */ true, HasExtract, CostKind);

  // New pattern: load VecPtr
  InstructionCost NewCost =
      TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
  // Optionally, we are shuffling the loaded vector element(s) into place.
  // For the mask, set everything but element 0 to undef to prevent poison
  // from propagating from the extra loaded memory. This will also optionally
  // shrink/grow the vector from the loaded size to the output size.
  // We assume this operation has no cost in codegen if there was no offset.
  // Note that we could use freeze to avoid poison problems, but then we might
  // still need a shuffle to change the vector size.
  auto *Ty = cast<FixedVectorType>(I.getType());
  unsigned OutputNumElts = Ty->getNumElements();
  SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
  Mask[0] = OffsetEltIndex;
  if (OffsetEltIndex)
    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
                                  CostKind);

  // We can aggressively convert to the vector form because the backend can
  // invert this transform if it does not result in a performance win.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // It is safe and potentially profitable to load a vector directly:
  // inselt undef, load Scalar, 0 --> load VecPtr
  IRBuilder<> Builder(Load);
  Value *CastedPtr =
      Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
  VecLd = Builder.CreateShuffleVector(VecLd, Mask);

  replaceValue(I, *VecLd);
  ++NumVecLoad;
  return true;
}

/// If we are loading a vector and then inserting it into a larger vector with
/// undefined elements, try to load the larger vector and eliminate the insert.
/// This removes a shuffle in IR and may allow combining of other loaded values.
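/// Illustrative IR (editor's sketch, not from the source or its tests):
///   %v = load <2 x float>, ptr %p, align 16
///   %r = shufflevector <2 x float> %v, <2 x float> poison,
///        <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
/// may become a single wider load when the extra bytes are dereferenceable:
///   %r = load <4 x float>, ptr %p, align 16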
bool VectorCombine::widenSubvectorLoad(Instruction &I) {
  // Match subvector insert of fixed vector.
  auto *Shuf = cast<ShuffleVectorInst>(&I);
  if (!Shuf->isIdentityWithPadding())
    return false;

  // Allow a non-canonical shuffle mask that is choosing elements from op1.
  unsigned NumOpElts =
      cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
  unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
    return M >= (int)(NumOpElts);
  });

  auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
  if (!canWidenLoad(Load, TTI))
    return false;

  // We use minimal alignment (maximum flexibility) because we only care about
  // the dereferenceable region. When calculating cost and creating a new op,
  // we may use a larger value based on alignment attributes.
  auto *Ty = cast<FixedVectorType>(I.getType());
  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
  Align Alignment = Load->getAlign();
  if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
    return false;

  Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
  Type *LoadTy = Load->getType();
  unsigned AS = Load->getPointerAddressSpace();

  // Original pattern: insert_subvector (load PtrOp)
  // This conservatively assumes that the cost of a subvector insert into an
  // undef value is 0. We could add that cost if the cost model accurately
  // reflects the real cost of that operation.
  InstructionCost OldCost =
      TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);

  // New pattern: load PtrOp
  InstructionCost NewCost =
      TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);

  // We can aggressively convert to the vector form because the backend can
  // invert this transform if it does not result in a performance win.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  IRBuilder<> Builder(Load);
  Value *CastedPtr =
      Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
  Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
  replaceValue(I, *VecLd);
  ++NumVecLoad;
  return true;
}

/// Determine which, if any, of the inputs should be replaced by a shuffle
/// followed by extract from a different index.
ExtractElementInst *VectorCombine::getShuffleExtract(
    ExtractElementInst *Ext0, ExtractElementInst *Ext1,
    unsigned PreferredExtractIndex = InvalidIndex) const {
  auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
  auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
  assert(Index0C && Index1C && "Expected constant extract indexes");

  unsigned Index0 = Index0C->getZExtValue();
  unsigned Index1 = Index1C->getZExtValue();

  // If the extract indexes are identical, no shuffle is needed.
  if (Index0 == Index1)
    return nullptr;

  Type *VecTy = Ext0->getVectorOperand()->getType();
  assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
  InstructionCost Cost0 =
      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
  InstructionCost Cost1 =
      TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);

  // If both costs are invalid, no shuffle is needed.
  if (!Cost0.isValid() && !Cost1.isValid())
    return nullptr;

  // We are extracting from 2 different indexes, so one operand must be shuffled
  // before performing a vector operation and/or extract. The more expensive
  // extract will be replaced by a shuffle.
  if (Cost0 > Cost1)
    return Ext0;
  if (Cost1 > Cost0)
    return Ext1;

  // If the costs are equal and there is a preferred extract index, shuffle the
  // opposite operand.
  if (PreferredExtractIndex == Index0)
    return Ext1;
  if (PreferredExtractIndex == Index1)
    return Ext0;

  // Otherwise, replace the extract with the higher index.
  return Index0 > Index1 ? Ext0 : Ext1;
}

/// Compare the relative costs of 2 extracts followed by scalar operation vs.
/// vector operation(s) followed by extract. Return true if the existing
/// instructions are cheaper than a vector alternative. Otherwise, return false
/// and if one of the extracts should be transformed to a shufflevector, set
/// \p ConvertToShuffle to that extract instruction.
bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
                                          ExtractElementInst *Ext1,
                                          const Instruction &I,
                                          ExtractElementInst *&ConvertToShuffle,
                                          unsigned PreferredExtractIndex) {
  auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
  auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
  assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");

  unsigned Opcode = I.getOpcode();
  Value *Ext0Src = Ext0->getVectorOperand();
  Value *Ext1Src = Ext1->getVectorOperand();
  Type *ScalarTy = Ext0->getType();
  auto *VecTy = cast<VectorType>(Ext0Src->getType());
  InstructionCost ScalarOpCost, VectorOpCost;

  // Get cost estimates for scalar and vector versions of the operation.
  bool IsBinOp = Instruction::isBinaryOp(Opcode);
  if (IsBinOp) {
    ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
    VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
  } else {
    assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
           "Expected a compare");
    CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
    ScalarOpCost = TTI.getCmpSelInstrCost(
        Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
    VectorOpCost = TTI.getCmpSelInstrCost(
        Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
  }

  // Get cost estimates for the extract elements. These costs will factor into
  // both sequences.
  unsigned Ext0Index = Ext0IndexC->getZExtValue();
  unsigned Ext1Index = Ext1IndexC->getZExtValue();

  InstructionCost Extract0Cost =
      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
  InstructionCost Extract1Cost =
      TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);

  // A more expensive extract will always be replaced by a splat shuffle.
  // For example, if Ext0 is more expensive:
  // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
  // extelt (opcode (splat V0, Ext0), V1), Ext1
  // TODO: Evaluate whether that always results in lowest cost. Alternatively,
  // check the cost of creating a broadcast shuffle and shuffling both
  // operands to element 0.
  unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
  unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
  InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);

  // Extra uses of the extracts mean that we include those costs in the
  // vector total because those instructions will not be eliminated.
  InstructionCost OldCost, NewCost;
  if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
    // Handle a special case. If the 2 extracts are identical, adjust the
    // formulas to account for that. The extra use charge allows for either the
    // CSE'd pattern or an unoptimized form with identical values:
    // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
    bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
                                  : !Ext0->hasOneUse() || !Ext1->hasOneUse();
    OldCost = CheapExtractCost + ScalarOpCost;
    NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
  } else {
    // Handle the general case. Each extract is actually a different value:
    // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
    OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
    NewCost = VectorOpCost + CheapExtractCost +
              !Ext0->hasOneUse() * Extract0Cost +
              !Ext1->hasOneUse() * Extract1Cost;
  }

  ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
  if (ConvertToShuffle) {
    if (IsBinOp && DisableBinopExtractShuffle)
      return true;

    // If we are extracting from 2 different indexes, then one operand must be
    // shuffled before performing the vector operation. The shuffle mask is
    // poison except for 1 lane that is being translated to the remaining
    // extraction lane. Therefore, it is a splat shuffle. Ex:
    // ShufMask = { poison, poison, 0, poison }
    // TODO: The cost model has an option for a "broadcast" shuffle
    // (splat-from-element-0), but no option for a more general splat.
    if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
      SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
                                   PoisonMaskElem);
      ShuffleMask[BestInsIndex] = BestExtIndex;
      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                    VecTy, VecTy, ShuffleMask, CostKind, 0,
                                    nullptr, {ConvertToShuffle});
    } else {
      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                    VecTy, VecTy, {}, CostKind, 0, nullptr,
                                    {ConvertToShuffle});
    }
  }

  // Aggressively form a vector op if the cost is equal because the transform
  // may enable further optimization.
  // Codegen can reverse this transform (scalarize) if it was not profitable.
  return OldCost < NewCost;
}

/// Create a shuffle that translates (shifts) 1 element from the input vector
/// to a new element location.
static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
                                 unsigned NewIndex, IRBuilderBase &Builder) {
  // The shuffle mask is poison except for 1 lane that is being translated
  // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
  // ShufMask = { 2, poison, poison, poison }
  auto *VecTy = cast<FixedVectorType>(Vec->getType());
  SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
  ShufMask[NewIndex] = OldIndex;
  return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
}

/// Given an extract element instruction with constant index operand, shuffle
/// the source vector (shift the scalar element) to a NewIndex for extraction.
/// Return null if the input can be constant folded, so that we are not creating
/// unnecessary instructions.
static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
                               IRBuilderBase &Builder) {
  // Shufflevectors can only be created for fixed-width vectors.
  Value *X = ExtElt->getVectorOperand();
  if (!isa<FixedVectorType>(X->getType()))
    return nullptr;

  // If the extract can be constant-folded, this code is unsimplified. Defer
  // to other passes to handle that.
  Value *C = ExtElt->getIndexOperand();
  assert(isa<ConstantInt>(C) && "Expected a constant index operand");
  if (isa<Constant>(X))
    return nullptr;

  Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
                                   NewIndex, Builder);
  return Shuf;
}

/// Try to reduce extract element costs by converting scalar compares to vector
/// compares followed by extract.
/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
                                    Instruction &I) {
  assert(isa<CmpInst>(&I) && "Expected a compare");

  // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
  // --> extelt (cmp Pred V0, V1), ExtIndex
  ++NumVecCmp;
  CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
  Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
  return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
}

/// Try to reduce extract element costs by converting scalar binops to vector
/// binops followed by extract.
/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
                                      Instruction &I) {
  assert(isa<BinaryOperator>(&I) && "Expected a binary operator");

  // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
  // --> extelt (bo V0, V1), ExtIndex
  ++NumVecBO;
  Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
                                     V1, "foldExtExtBinop");

  // All IR flags are safe to back-propagate because any potential poison
  // created in unused vector elements is discarded by the extract.
  if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
    VecBOInst->copyIRFlags(&I);

  return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
}

/// Match an instruction with extracted vector operands.
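/// Illustrative IR (editor's sketch, not from the source or its tests):
///   %e0 = extractelement <4 x i32> %x, i32 0
///   %e1 = extractelement <4 x i32> %y, i32 0
///   %r  = add i32 %e0, %e1
/// may become
///   %v  = add <4 x i32> %x, %y
///   %r  = extractelement <4 x i32> %v, i32 0
/// when the vector op plus one extract is no more expensive than two extracts
/// plus the scalar op.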
bool VectorCombine::foldExtractExtract(Instruction &I) {
  // It is not safe to transform things like div, urem, etc. because we may
  // create undefined behavior when executing those on unknown vector elements.
  if (!isSafeToSpeculativelyExecute(&I))
    return false;

  Instruction *I0, *I1;
  CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
  if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
      !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
    return false;

  Value *V0, *V1;
  uint64_t C0, C1;
  if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
      !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
      V0->getType() != V1->getType())
    return false;

  // If the scalar value 'I' is going to be re-inserted into a vector, then try
  // to create an extract to that same element. The extract/insert can be
  // reduced to a "select shuffle".
  // TODO: If we add a larger pattern match that starts from an insert, this
  // probably becomes unnecessary.
  auto *Ext0 = cast<ExtractElementInst>(I0);
  auto *Ext1 = cast<ExtractElementInst>(I1);
  uint64_t InsertIndex = InvalidIndex;
  if (I.hasOneUse())
    match(I.user_back(),
          m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));

  ExtractElementInst *ExtractToChange;
  if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
    return false;

  Value *ExtOp0 = Ext0->getVectorOperand();
  Value *ExtOp1 = Ext1->getVectorOperand();

  if (ExtractToChange) {
    unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
    Value *NewExtOp =
        translateExtract(ExtractToChange, CheapExtractIdx, Builder);
    if (!NewExtOp)
      return false;
    if (ExtractToChange == Ext0)
      ExtOp0 = NewExtOp;
    else
      ExtOp1 = NewExtOp;
  }

  Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
                                            : Ext0->getIndexOperand();
  Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
                      ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
                      : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
  Worklist.push(Ext0);
  Worklist.push(Ext1);
  replaceValue(I, *NewExt);
  return true;
}

/// Try to replace an extract + scalar fneg + insert with a vector fneg +
/// shuffle.
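/// Illustrative IR (editor's sketch, not from the source or its tests):
///   %e = extractelement <4 x float> %src, i64 2
///   %n = fneg float %e
///   %r = insertelement <4 x float> %dst, float %n, i64 2
/// may become
///   %v = fneg <4 x float> %src
///   %r = shufflevector <4 x float> %dst, <4 x float> %v,
///        <4 x i32> <i32 0, i32 1, i32 6, i32 3>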
bool VectorCombine::foldInsExtFNeg(Instruction &I) {
  // Match an insert (op (extract)) pattern.
  Value *DstVec;
  uint64_t ExtIdx, InsIdx;
  Instruction *FNeg;
  if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
                             m_ConstantInt(InsIdx))))
    return false;

  // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
  Value *SrcVec;
  Instruction *Extract;
  if (!match(FNeg, m_FNeg(m_CombineAnd(
                       m_Instruction(Extract),
                       m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
    return false;

  auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
  auto *DstVecScalarTy = DstVecTy->getScalarType();
  auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
  if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
    return false;

  // Ignore the transform if the insert/extract index is out of bounds or the
  // destination vector has only one element.
  unsigned NumDstElts = DstVecTy->getNumElements();
  unsigned NumSrcElts = SrcVecTy->getNumElements();
  if (ExtIdx >= NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
    return false;

  // We are inserting the negated element into the same lane that we extracted
  // from. This is equivalent to a select-shuffle that chooses all but the
  // negated element from the destination vector.
  SmallVector<int> Mask(NumDstElts);
  std::iota(Mask.begin(), Mask.end(), 0);
  Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
  InstructionCost OldCost =
      TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
      TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);

  // If the extract has one use, it will be eliminated, so count it in the
  // original cost. If it has more than one use, ignore the cost because it
  // will be the same before/after.
  if (Extract->hasOneUse())
    OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);

  InstructionCost NewCost =
      TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, DstVecTy,
                         DstVecTy, Mask, CostKind);

  bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
  // If the lengths of the two vectors are not equal,
  // we need to add a length-change vector. Add this cost.
  SmallVector<int> SrcMask;
  if (NeedLenChg) {
    SrcMask.assign(NumDstElts, PoisonMaskElem);
    SrcMask[ExtIdx % NumDstElts] = ExtIdx;
    NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                  DstVecTy, SrcVecTy, SrcMask, CostKind);
  }

  LLVM_DEBUG(dbgs() << "Found an insertion of (extracted) fneg: " << I
                    << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
                    << "\n");
  if (NewCost > OldCost)
    return false;

  Value *NewShuf, *LenChgShuf = nullptr;
  // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
  Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
  if (NeedLenChg) {
    // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
    LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
    NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
    Worklist.pushValue(LenChgShuf);
  } else {
    // shuffle DstVec, (fneg SrcVec), Mask
    NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
  }

  Worklist.pushValue(VecFNeg);
  replaceValue(I, *NewShuf);
  return true;
}

/// Try to fold insert(binop(x,y),binop(a,b),idx)
/// --> binop(insert(x,a,idx),insert(y,b,idx))
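/// Illustrative IR (editor's sketch, not from the source or its tests):
///   %vb = add <4 x i32> %x, %y
///   %sb = add i32 %a, %b
///   %r  = insertelement <4 x i32> %vb, i32 %sb, i64 1
/// may become
///   %i0 = insertelement <4 x i32> %x, i32 %a, i64 1
///   %i1 = insertelement <4 x i32> %y, i32 %b, i64 1
///   %r  = add <4 x i32> %i0, %i1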
bool VectorCombine::foldInsExtBinop(Instruction &I) {
  BinaryOperator *VecBinOp, *SclBinOp;
  uint64_t Index;
  if (!match(&I,
             m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
                         m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
    return false;

  // TODO: Add support for addlike etc.
  Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
  if (BinOpcode != SclBinOp->getOpcode())
    return false;

  auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
  if (!ResultTy)
    return false;

  // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
  // shuffle?

  InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
                            TTI.getInstructionCost(VecBinOp, CostKind) +
                            TTI.getInstructionCost(SclBinOp, CostKind);
  InstructionCost NewCost =
      TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
      TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
                             Index, VecBinOp->getOperand(0),
                             SclBinOp->getOperand(0)) +
      TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
                             Index, VecBinOp->getOperand(1),
                             SclBinOp->getOperand(1));

  LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
                    << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
                    << "\n");
  if (NewCost > OldCost)
    return false;

  Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
                                               SclBinOp->getOperand(0), Index);
  Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
                                               SclBinOp->getOperand(1), Index);
  Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);

  // Intersect flags from the old binops.
  if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
    NewInst->copyIRFlags(VecBinOp);
    NewInst->andIRFlags(SclBinOp);
  }

  Worklist.pushValue(NewIns0);
  Worklist.pushValue(NewIns1);
  replaceValue(I, *NewBO);
  return true;
}

/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
/// Supports: bitcast, trunc, sext, zext
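/// Illustrative IR (editor's sketch, not from the source or its tests):
///   %bx = bitcast <4 x i32> %x to <8 x i16>
///   %by = bitcast <4 x i32> %y to <8 x i16>
///   %r  = and <8 x i16> %bx, %by
/// may become
///   %a  = and <4 x i32> %x, %y
///   %r  = bitcast <4 x i32> %a to <8 x i16>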
bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
  // Check if this is a bitwise logic operation
  auto *BinOp = dyn_cast<BinaryOperator>(&I);
  if (!BinOp || !BinOp->isBitwiseLogicOp())
    return false;

  // Get the cast instructions
  auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
  auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
  if (!LHSCast || !RHSCast) {
    LLVM_DEBUG(dbgs() << "  One or both operands are not cast instructions\n");
    return false;
  }

  // Both casts must be the same type
  Instruction::CastOps CastOpcode = LHSCast->getOpcode();
  if (CastOpcode != RHSCast->getOpcode())
    return false;

  // Only handle supported cast operations
  switch (CastOpcode) {
  case Instruction::BitCast:
  case Instruction::Trunc:
  case Instruction::SExt:
  case Instruction::ZExt:
    break;
  default:
    return false;
  }

  Value *LHSSrc = LHSCast->getOperand(0);
  Value *RHSSrc = RHSCast->getOperand(0);

  // Source types must match
  if (LHSSrc->getType() != RHSSrc->getType())
    return false;

  auto *SrcTy = LHSSrc->getType();
  auto *DstTy = I.getType();
  // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
  // Other casts only handle vector types with integer elements.
  if (CastOpcode != Instruction::BitCast &&
      (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
    return false;

  // Only integer scalar/vector values are legal for bitwise logic operations.
  if (!SrcTy->getScalarType()->isIntegerTy() ||
      !DstTy->getScalarType()->isIntegerTy())
    return false;

  // Cost check:
  // OldCost = bitlogic + 2*casts
  // NewCost = bitlogic + cast

  // Calculate specific costs for each cast with instruction context
  InstructionCost LHSCastCost = TTI.getCastInstrCost(
      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
  InstructionCost RHSCastCost = TTI.getCastInstrCost(
      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);

  InstructionCost OldCost =
      TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
      LHSCastCost + RHSCastCost;

  // For the new cost, we can't provide an instruction (it doesn't exist yet)
  InstructionCost GenericCastCost = TTI.getCastInstrCost(
      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);

  InstructionCost NewCost =
      TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
      GenericCastCost;

  // Account for multi-use casts using specific costs
  if (!LHSCast->hasOneUse())
    NewCost += LHSCastCost;
  if (!RHSCast->hasOneUse())
    NewCost += RHSCastCost;

  LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
                    << " NewCost=" << NewCost << "\n");

  if (NewCost > OldCost)
    return false;

  // Create the operation on the source type
  Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
                                     BinOp->getName() + ".inner");
  if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
    NewBinOp->copyIRFlags(BinOp);

  Worklist.pushValue(NewOp);

  // Create the cast operation directly to ensure we get a new instruction
  Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());

  // Preserve cast instruction flags
  NewCast->copyIRFlags(LHSCast);
  NewCast->andIRFlags(RHSCast);

  // Insert the new instruction
  Value *Result = Builder.Insert(NewCast);

  replaceValue(I, *Result);
  return true;
}

/// Match:
///   bitop(castop(x), C) ->
///   bitop(castop(x), castop(InvC)) ->
///   castop(bitop(x, InvC))
/// Supports: bitcast, trunc, sext, zext (when the constant can be inverted
/// losslessly through the cast)
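/// Illustrative IR (editor's sketch for a little-endian target, not from the
/// source or its tests):
///   %bx = bitcast <2 x i32> %x to <4 x i16>
///   %r  = and <4 x i16> %bx, <i16 -1, i16 0, i16 -1, i16 0>
/// may become, with InvC = <i32 65535, i32 65535> as the pre-cast image of
/// the constant:
///   %a  = and <2 x i32> %x, <i32 65535, i32 65535>
///   %r  = bitcast <2 x i32> %a to <4 x i16>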
bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
  Value *LHS;
  Constant *C;

  // Check if this is a bitwise logic operation
  if (!match(&I, m_c_BitwiseLogic(m_Value(LHS), m_Constant(C))))
    return false;

  // Get the cast instruction
  auto *LHSCast = dyn_cast<CastInst>(LHS);
  if (!LHSCast)
    return false;

  Instruction::CastOps CastOpcode = LHSCast->getOpcode();

  // Only handle supported cast operations
  switch (CastOpcode) {
  case Instruction::BitCast:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::Trunc:
    break;
  default:
    return false;
  }

  Value *LHSSrc = LHSCast->getOperand(0);

  auto *SrcTy = LHSSrc->getType();
  auto *DstTy = I.getType();
  // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
  // Other casts only handle vector types with integer elements.
  if (CastOpcode != Instruction::BitCast &&
      (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
    return false;

  // Only integer scalar/vector values are legal for bitwise logic operations.
  if (!SrcTy->getScalarType()->isIntegerTy() ||
      !DstTy->getScalarType()->isIntegerTy())
    return false;

  // Find the constant InvC such that castop(InvC) equals C.
  PreservedCastFlags RHSFlags;
  Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
  if (!InvC)
    return false;

  // Cost check:
  // OldCost = bitlogic + cast
  // NewCost = bitlogic + cast

  // Calculate the specific cost of the existing cast with instruction context
  InstructionCost LHSCastCost = TTI.getCastInstrCost(
      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);

  InstructionCost OldCost =
      TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;

  // For the new cost, we can't provide an instruction (it doesn't exist yet)
  InstructionCost GenericCastCost = TTI.getCastInstrCost(
      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);

  InstructionCost NewCost =
      TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
      GenericCastCost;

  // Account for multi-use casts using specific costs
  if (!LHSCast->hasOneUse())
    NewCost += LHSCastCost;

  LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
                    << " NewCost=" << NewCost << "\n");

  if (NewCost > OldCost)
    return false;

  // Create the operation on the source type
  Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
                                     LHSSrc, InvC, I.getName() + ".inner");
  if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
    NewBinOp->copyIRFlags(&I);

  Worklist.pushValue(NewOp);

  // Create the cast operation directly to ensure we get a new instruction
  Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());

  // Preserve cast instruction flags
  if (RHSFlags.NNeg)
    NewCast->setNonNeg();
  if (RHSFlags.NUW)
    NewCast->setHasNoUnsignedWrap();
  if (RHSFlags.NSW)
    NewCast->setHasNoSignedWrap();

  NewCast->andIRFlags(LHSCast);

  // Insert the new instruction
  Value *Result = Builder.Insert(NewCast);

  replaceValue(I, *Result);
  return true;
}

/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
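/// Illustrative IR (editor's sketch, not from the source or its tests):
///   %s = shufflevector <4 x i32> %x, <4 x i32> poison,
///        <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = bitcast <4 x i32> %s to <8 x i16>
/// may become
///   %b = bitcast <4 x i32> %x to <8 x i16>
///   %r = shufflevector <8 x i16> %b, <8 x i16> poison,
///        <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>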
bool VectorCombine::foldBitcastShuffle(Instruction &I) {
  Value *V0, *V1;
  ArrayRef<int> Mask;
  if (!match(&I, m_BitCast(m_OneUse(
                     m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
    return false;

  // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
  // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
  // mask for scalable type is a splat or not.
  // 2) Disallow non-vector casts.
  // TODO: We could allow any shuffle.
  auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
  auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
  if (!DestTy || !SrcTy)
    return false;

  unsigned DestEltSize = DestTy->getScalarSizeInBits();
  unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
  if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
    return false;

  bool IsUnary = isa<UndefValue>(V1);

  // For binary shuffles, only fold bitcast(shuffle(X,Y))
  // if it won't increase the number of bitcasts.
  if (!IsUnary) {
    auto *BCTy0 =
        dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
    auto *BCTy1 =
        dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
    if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
        !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
      return false;
  }

  SmallVector<int, 16> NewMask;
  if (DestEltSize <= SrcEltSize) {
    // The bitcast is from wide to narrow/equal elements. The shuffle mask can
    // always be expanded to the equivalent form choosing narrower elements.
    assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
    unsigned ScaleFactor = SrcEltSize / DestEltSize;
    narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
  } else {
    // The bitcast is from narrow elements to wide elements. The shuffle mask
    // must choose consecutive elements to allow casting first.
    assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
    unsigned ScaleFactor = DestEltSize / SrcEltSize;
    if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
      return false;
  }

  // Bitcast the shuffle src - keep its original width but using the
  // destination scalar type.
  unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
  auto *NewShuffleTy =
      FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
  auto *OldShuffleTy =
      FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
  unsigned NumOps = IsUnary ? 1 : 2;

  // The new shuffle must not cost more than the old shuffle.
  TargetTransformInfo::ShuffleKind SK =
      IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
              : TargetTransformInfo::SK_PermuteTwoSrc;

  InstructionCost NewCost =
      TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
      (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
                                     TargetTransformInfo::CastContextHint::None,
                                     CostKind));
  InstructionCost OldCost =
      TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
      TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
                           TargetTransformInfo::CastContextHint::None,
                           CostKind);

  LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n  OldCost: "
                    << OldCost << " vs NewCost: " << NewCost << "\n");

  if (NewCost > OldCost || !NewCost.isValid())
    return false;

  // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
  ++NumShufOfBitcast;
  Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
  Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
  Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
  replaceValue(I, *Shuf);
  return true;
}

/// VP Intrinsics whose vector operands are both splat values may be simplified
/// into the scalar version of the operation and the result splatted. This
/// can lead to scalarization down the line.
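/// Illustrative IR (editor's sketch, not from the source or its tests): with
/// %xs and %ys splats of the scalars %x and %y, and an all-true mask,
///   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %xs, <4 x i32> %ys,
///                                          <4 x i1> %alltrue, i32 %evl)
/// may become a scalar add of %x and %y followed by a splat of the result
/// (an insertelement plus a broadcast shufflevector).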
bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
  if (!isa<VPIntrinsic>(I))
    return false;
  VPIntrinsic &VPI = cast<VPIntrinsic>(I);
  Value *Op0 = VPI.getArgOperand(0);
  Value *Op1 = VPI.getArgOperand(1);

  if (!isSplatValue(Op0) || !isSplatValue(Op1))
    return false;

  // Check getSplatValue early in this function, to avoid doing unnecessary
  // work.
  Value *ScalarOp0 = getSplatValue(Op0);
  Value *ScalarOp1 = getSplatValue(Op1);
  if (!ScalarOp0 || !ScalarOp1)
    return false;

  // For the binary VP intrinsics supported here, the result on disabled lanes
  // is a poison value. For now, only do this simplification if all lanes
  // are active.
  // TODO: Relax the condition that all lanes are active by using insertelement
  // on inactive lanes.
  auto IsAllTrueMask = [](Value *MaskVal) {
    if (Value *SplattedVal = getSplatValue(MaskVal))
      if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
        return ConstValue->isAllOnesValue();
    return false;
  };
  if (!IsAllTrueMask(VPI.getArgOperand(2)))
    return false;

  // Check to make sure we support scalarization of the intrinsic
  Intrinsic::ID IntrID = VPI.getIntrinsicID();
  if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
    return false;

  // Calculate the cost of splatting both operands into vectors and the vector
  // intrinsic.
  VectorType *VecTy = cast<VectorType>(VPI.getType());
  SmallVector<int> Mask;
  if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
    Mask.resize(FVTy->getNumElements(), 0);
  InstructionCost SplatCost =
      TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, Mask,
                         CostKind);

  // Calculate the cost of the VP intrinsic.
  SmallVector<Type *, 4> Args;
  for (Value *V : VPI.args())
    Args.push_back(V->getType());
  IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
  InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
  InstructionCost OldCost = 2 * SplatCost + VectorOpCost;

  // Determine scalar opcode
  std::optional<unsigned> FunctionalOpcode = VPI.getFunctionalOpcode();
  std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
  if (!FunctionalOpcode) {
    ScalarIntrID = VPI.getFunctionalIntrinsicID();
    if (!ScalarIntrID)
      return false;
  }

  // Calculate the cost of scalarizing.
  InstructionCost ScalarOpCost = 0;
  if (ScalarIntrID) {
    IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
    ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
  } else {
    ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
                                              VecTy->getScalarType(), CostKind);
  }

  // The existing splats may be kept around if other instructions use them.
  InstructionCost CostToKeepSplats =
      (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
  InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;

  LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI << "\n");
  LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
                    << ", Cost of scalarizing: " << NewCost << "\n");

  // We want to scalarize unless the vector variant actually has lower cost.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // Scalarize the intrinsic
  ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
  Value *EVL = VPI.getArgOperand(3);

  // If the VP op might introduce UB or poison, we can scalarize it provided
  // that we know the EVL > 0: If the EVL is zero, then the original VP op
  // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
  // scalarizing it.
  bool SafeToSpeculate;
  if (ScalarIntrID)
    SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
                          .hasAttribute(Attribute::AttrKind::Speculatable);
  else
    SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
        *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
  if (!SafeToSpeculate &&
      !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
    return false;

  Value *ScalarVal =
      ScalarIntrID
          ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
                                    {ScalarOp0, ScalarOp1})
          : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
                                ScalarOp0, ScalarOp1);

  replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
  return true;
}

/// Match a vector op/compare/intrinsic with at least one
/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
/// by insertelement.
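/// Illustrative IR (editor's sketch, not from the source or its tests):
///   %i0 = insertelement <4 x i32> %c0, i32 %x, i64 1
///   %i1 = insertelement <4 x i32> %c1, i32 %y, i64 1
///   %r  = add <4 x i32> %i0, %i1
/// may become
///   %s  = add i32 %x, %y
///   %r  = insertelement <4 x i32> %nc, i32 %s, i64 1
/// where %nc is the constant fold of (add <4 x i32> %c0, %c1).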
bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
  auto *UO = dyn_cast<UnaryOperator>(&I);
  auto *BO = dyn_cast<BinaryOperator>(&I);
  auto *CI = dyn_cast<CmpInst>(&I);
  auto *II = dyn_cast<IntrinsicInst>(&I);
  if (!UO && !BO && !CI && !II)
    return false;

  // TODO: Allow intrinsics with different argument types
  if (II) {
    if (!isTriviallyVectorizable(II->getIntrinsicID()))
      return false;
    for (auto [Idx, Arg] : enumerate(II->args()))
      if (Arg->getType() != II->getType() &&
          !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
        return false;
  }

  // Do not convert the vector condition of a vector select into a scalar
  // condition. That may cause problems for codegen because of differences in
  // boolean formats and register-file transfers.
  // TODO: Can we account for that in the cost model?
  if (CI)
    for (User *U : I.users())
      if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
        return false;

  // Match constant vectors or scalars being inserted into constant vectors:
  // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
  SmallVector<Value *> VecCs, ScalarOps;
  std::optional<uint64_t> Index;

  auto Ops = II ? II->args() : I.operands();
  for (auto [OpNum, Op] : enumerate(Ops)) {
    Constant *VecC;
    Value *V;
    uint64_t InsIdx = 0;
    if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
                                    m_ConstantInt(InsIdx)))) {
      // Bail if any inserts are out of bounds.
      VectorType *OpTy = cast<VectorType>(Op->getType());
      if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
        return false;
      // All inserts must have the same index.
      // TODO: Deal with mismatched index constants and variable indexes?
      if (!Index)
        Index = InsIdx;
      else if (InsIdx != *Index)
        return false;
      VecCs.push_back(VecC);
      ScalarOps.push_back(V);
    } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
                                                        OpNum, &TTI)) {
      VecCs.push_back(Op.get());
      ScalarOps.push_back(Op.get());
    } else if (match(Op.get(), m_Constant(VecC))) {
      VecCs.push_back(VecC);
      ScalarOps.push_back(nullptr);
    } else {
      return false;
    }
  }

  // Bail if all operands are constant.
  if (!Index.has_value())
    return false;

  VectorType *VecTy = cast<VectorType>(I.getType());
  Type *ScalarTy = VecTy->getScalarType();
  assert(VecTy->isVectorTy() &&
         (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
          ScalarTy->isPointerTy()) &&
         "Unexpected types for insert element into binop or cmp");

  unsigned Opcode = I.getOpcode();
  InstructionCost ScalarOpCost, VectorOpCost;
  if (CI) {
    CmpInst::Predicate Pred = CI->getPredicate();
    ScalarOpCost = TTI.getCmpSelInstrCost(
        Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
    VectorOpCost = TTI.getCmpSelInstrCost(
        Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
  } else if (UO || BO) {
    ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
    VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
  } else {
    IntrinsicCostAttributes ScalarICA(
        II->getIntrinsicID(), ScalarTy,
        SmallVector<Type *>(II->arg_size(), ScalarTy));
    ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
    IntrinsicCostAttributes VectorICA(
        II->getIntrinsicID(), VecTy,
        SmallVector<Type *>(II->arg_size(), VecTy));
    VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
  }

  // Fold the vector constants in the original vectors into a new base vector
  // to get more accurate cost modelling.
  Value *NewVecC = nullptr;
  if (CI)
    NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
  else if (UO)
    NewVecC =
        simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
  else if (BO)
    NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
  else if (II)
    NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);

  if (!NewVecC)
    return false;

  // Get cost estimate for the insert element. This cost will factor into
  // both sequences.
  InstructionCost OldCost = VectorOpCost;
  InstructionCost NewCost =
      ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                            CostKind, *Index, NewVecC);

  for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
    if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
                              II->getIntrinsicID(), Idx, &TTI)))
      continue;
    InstructionCost InsertCost = TTI.getVectorInstrCost(
        Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
    OldCost += InsertCost;
    NewCost += !Op->hasOneUse() * InsertCost;
  }

  // We want to scalarize unless the vector variant actually has lower cost.
  if (OldCost < NewCost || !NewCost.isValid())
    return false;

  // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
  // inselt NewVecC, (scalar_op V0, V1), Index
  if (CI)
    ++NumScalarCmp;
  else if (UO || BO)
    ++NumScalarOps;
  else
    ++NumScalarIntrinsic;

  // For constant cases, extract the scalar element; this should constant fold.
  for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
    if (!Scalar)
      ScalarOps[OpIdx] = Builder.CreateExtractElement(
          cast<Constant>(VecC), Builder.getInt64(*Index));

  Value *Scalar;
  if (CI)
    Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
  else if (UO || BO)
    Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
  else
    Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);

  Scalar->setName(I.getName() + ".scalar");

  // All IR flags are safe to back-propagate. There is no potential for extra
  // poison to be created by the scalar instruction.
  if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
    ScalarInst->copyIRFlags(&I);

  Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
  replaceValue(I, *Insert);
  return true;
}

/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
/// a vector into vector operations followed by extract. Note: The SLP pass
/// may miss this pattern because of implementation problems.
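/// Illustrative IR (editor's sketch, not from the source or its tests):
///   %e0 = extractelement <4 x i32> %x, i32 0
///   %e1 = extractelement <4 x i32> %x, i32 3
///   %c0 = icmp sgt i32 %e0, 42
///   %c1 = icmp sgt i32 %e1, 7
///   %r  = and i1 %c0, %c1
/// may become a single vector icmp against
/// <i32 42, i32 poison, i32 poison, i32 7>, a shuffle that moves lane 3 onto
/// lane 0, a vector 'and', and one extract.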
1449bool VectorCombine::foldExtractedCmps(Instruction &I) {
1450 auto *BI = dyn_cast<BinaryOperator>(&I);
1451
1452 // We are looking for a scalar binop of booleans.
1453 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1454 if (!BI || !I.getType()->isIntegerTy(1))
1455 return false;
1456
1457 // The compare predicates should match, and each compare should have a
1458 // constant operand.
1459 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1460 Instruction *I0, *I1;
1461 Constant *C0, *C1;
1462 CmpPredicate P0, P1;
1463 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1464 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1465 return false;
1466
1467 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1468 if (!MatchingPred)
1469 return false;
1470
1471 // The compare operands must be extracts of the same vector with constant
1472 // extract indexes.
1473 Value *X;
1474 uint64_t Index0, Index1;
1475 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1476 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1477 return false;
1478
1479 auto *Ext0 = cast<ExtractElementInst>(I0);
1480 auto *Ext1 = cast<ExtractElementInst>(I1);
1481 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1482 if (!ConvertToShuf)
1483 return false;
1484 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1485 "Unknown ExtractElementInst");
1486
1487 // The original scalar pattern is:
1488 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1489 CmpInst::Predicate Pred = *MatchingPred;
1490 unsigned CmpOpcode =
1491 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1492 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1493 if (!VecTy)
1494 return false;
1495
1496 InstructionCost Ext0Cost =
1497 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1498 InstructionCost Ext1Cost =
1499 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
1500 InstructionCost CmpCost = TTI.getCmpSelInstrCost(
1501 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1502 CostKind);
1503
1504 InstructionCost OldCost =
1505 Ext0Cost + Ext1Cost + CmpCost * 2 +
1506 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1507
1508 // The proposed vector pattern is:
1509 // vcmp = cmp Pred X, VecC
1510 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1511 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1512 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1513 auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecTy));
1514 InstructionCost NewCost = TTI.getCmpSelInstrCost(
1515 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1516 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1517 ShufMask[CheapIndex] = ExpensiveIndex;
1518 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
1519 CmpTy, ShufMask, CostKind);
1520 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1521 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1522 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1523 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1524
1525 // Aggressively form vector ops if the cost is equal because the transform
1526 // may enable further optimization.
1527 // Codegen can reverse this transform (scalarize) if it was not profitable.
1528 if (OldCost < NewCost || !NewCost.isValid())
1529 return false;
1530
1531 // Create a vector constant from the 2 scalar constants.
1532 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1533 PoisonValue::get(VecTy->getElementType()));
1534 CmpC[Index0] = C0;
1535 CmpC[Index1] = C1;
1536 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1537 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1538 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1539 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1540 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1541 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1542 replaceValue(I, *NewExt);
1543 ++NumVecCmpBO;
1544 return true;
1545}
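// [Editorial example] An illustrative instance of the fold above, assuming
// Index0 = 0, Index1 = 1, and Ext1 chosen as ConvertToShuf (values are
// hypothetical):
//   %e0 = extractelement <4 x i32> %x, i32 0
//   %e1 = extractelement <4 x i32> %x, i32 1
//   %c0 = icmp sgt i32 %e0, 42
//   %c1 = icmp sgt i32 %e1, 99
//   %r  = or i1 %c0, %c1
// becomes:
//   %v = icmp sgt <4 x i32> %x, <i32 42, i32 99, i32 poison, i32 poison>
//   %s = shufflevector <4 x i1> %v, <4 x i1> poison,
//                      <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
//   %b = or <4 x i1> %v, %s
//   %r = extractelement <4 x i1> %b, i32 0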
1546
1547 static void analyzeCostOfVecReduction(const IntrinsicInst &II,
1548 TTI::TargetCostKind CostKind,
1549 const TargetTransformInfo &TTI,
1550 InstructionCost &CostBeforeReduction,
1551 InstructionCost &CostAfterReduction) {
1552 Instruction *Op0, *Op1;
1553 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1554 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1555 unsigned ReductionOpc =
1556 getArithmeticReductionInstruction(II.getIntrinsicID());
1557 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1558 bool IsUnsigned = isa<ZExtInst>(RedOp);
1559 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1560
1561 CostBeforeReduction =
1562 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1563 TTI::CastContextHint::None, CostKind, RedOp);
1564 CostAfterReduction =
1565 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1566 ExtType, FastMathFlags(), CostKind);
1567 return;
1568 }
1569 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1570 match(RedOp,
1571 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
1572 match(Op0, m_ZExtOrSExt(m_Value())) &&
1573 Op0->getOpcode() == Op1->getOpcode() &&
1574 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1575 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1576 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1577 bool IsUnsigned = isa<ZExtInst>(Op0);
1578 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1579 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1580
1581 InstructionCost ExtCost =
1582 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1583 TTI::CastContextHint::None, CostKind, Op0);
1584 InstructionCost MulCost =
1585 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1586 InstructionCost Ext2Cost =
1587 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1588 TTI::CastContextHint::None, CostKind, RedOp);
1589
1590 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1591 CostAfterReduction = TTI.getMulAccReductionCost(
1592 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1593 return;
1594 }
1595 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1596 std::nullopt, CostKind);
1597}
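// [Editorial example] The second pattern above matches IR like the following
// (the types are illustrative assumptions):
//   %ea = zext <16 x i8> %a to <16 x i16>
//   %eb = zext <16 x i8> %b to <16 x i16>
//   %m  = mul <16 x i16> %ea, %eb
//   %mx = zext <16 x i16> %m to <16 x i32>
//   %r  = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mx)
// which many targets can lower as a single multiply-accumulate reduction,
// hence the use of getMulAccReductionCost for CostAfterReduction.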
1598
1599bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1600 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1601 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1602 if (BinOpOpc == Instruction::Sub)
1603 ReductionIID = Intrinsic::vector_reduce_add;
1604 if (ReductionIID == Intrinsic::not_intrinsic)
1605 return false;
1606
1607 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1608 Intrinsic::ID IID) -> Value * {
1609 auto *II = dyn_cast<IntrinsicInst>(V);
1610 if (!II)
1611 return nullptr;
1612 if (II->getIntrinsicID() == IID && II->hasOneUse())
1613 return II->getArgOperand(0);
1614 return nullptr;
1615 };
1616
1617 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1618 if (!V0)
1619 return false;
1620 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1621 if (!V1)
1622 return false;
1623
1624 auto *VTy = cast<VectorType>(V0->getType());
1625 if (V1->getType() != VTy)
1626 return false;
1627 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1628 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1629 unsigned ReductionOpc =
1630 getArithmeticReductionInstruction(II0.getIntrinsicID());
1631
1632 InstructionCost OldCost = 0;
1633 InstructionCost NewCost = 0;
1634 InstructionCost CostOfRedOperand0 = 0;
1635 InstructionCost CostOfRed0 = 0;
1636 InstructionCost CostOfRedOperand1 = 0;
1637 InstructionCost CostOfRed1 = 0;
1638 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1639 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1640 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1641 NewCost =
1642 CostOfRedOperand0 + CostOfRedOperand1 +
1643 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1644 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1645 if (NewCost >= OldCost || !NewCost.isValid())
1646 return false;
1647
1648 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1649 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1650 << "\n");
1651 Value *VectorBO;
1652 if (BinOpOpc == Instruction::Or)
1653 VectorBO = Builder.CreateOr(V0, V1, "",
1654 cast<PossiblyDisjointInst>(I).isDisjoint());
1655 else
1656 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1657
1658 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1659 replaceValue(I, *Rdx);
1660 return true;
1661}
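// [Editorial example] A sketch of the reduction merge above (illustrative
// types and values):
//   %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0)
//   %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1)
//   %r  = add i32 %r0, %r1
// becomes, when the cost model agrees:
//   %s = add <4 x i32> %v0, %v1
//   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
// For sub, the same rewrite applies with a vector sub feeding reduce.add.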
1662
1663// Check if memory loc modified between two instrs in the same BB
1664 static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1665 BasicBlock::iterator End,
1666 const MemoryLocation &Loc, AAResults &AA) {
1667 unsigned NumScanned = 0;
1668 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1669 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1670 ++NumScanned > MaxInstrsToScan;
1671 });
1672}
1673
1674namespace {
1675/// Helper class to indicate whether a vector index can be safely scalarized and
1676/// if a freeze needs to be inserted.
1677class ScalarizationResult {
1678 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1679
1680 StatusTy Status;
1681 Value *ToFreeze;
1682
1683 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1684 : Status(Status), ToFreeze(ToFreeze) {}
1685
1686public:
1687 ScalarizationResult(const ScalarizationResult &Other) = default;
1688 ~ScalarizationResult() {
1689 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1690 }
1691
1692 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1693 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1694 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1695 return {StatusTy::SafeWithFreeze, ToFreeze};
1696 }
1697
1699 /// Returns true if the index can be scalarized without requiring a freeze.
1699 bool isSafe() const { return Status == StatusTy::Safe; }
1700 /// Returns true if the index cannot be scalarized.
1701 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1702 /// Returns true if the index can be scalarized, but requires inserting a
1703 /// freeze.
1704 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1705
1706 /// Reset the state to Unsafe and clear ToFreeze if set.
1707 void discard() {
1708 ToFreeze = nullptr;
1709 Status = StatusTy::Unsafe;
1710 }
1711
1712 /// Freeze the ToFreeze value and update the use in \p UserI to use it.
1713 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1714 assert(isSafeWithFreeze() &&
1715 "should only be used when freezing is required");
1716 assert(is_contained(ToFreeze->users(), &UserI) &&
1717 "UserI must be a user of ToFreeze");
1718 IRBuilder<>::InsertPointGuard Guard(Builder);
1719 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1720 Value *Frozen =
1721 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1722 for (Use &U : make_early_inc_range((UserI.operands())))
1723 if (U.get() == ToFreeze)
1724 U.set(Frozen);
1725
1726 ToFreeze = nullptr;
1727 }
1728};
1729} // namespace
1730
1731/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1732/// Idx. \p Idx must access a valid vector element.
1733static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1734 Instruction *CtxI,
1735 AssumptionCache &AC,
1736 const DominatorTree &DT) {
1737 // We do checks for both fixed vector types and scalable vector types.
1738 // This is the number of elements of fixed vector types,
1739 // or the minimum number of elements of scalable vector types.
1740 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1741 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1742
1743 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1744 if (C->getValue().ult(NumElements))
1745 return ScalarizationResult::safe();
1746 return ScalarizationResult::unsafe();
1747 }
1748
1749 // Always unsafe if the index type can't handle all inbound values.
1750 if (!llvm::isUIntN(IntWidth, NumElements))
1751 return ScalarizationResult::unsafe();
1752
1753 APInt Zero(IntWidth, 0);
1754 APInt MaxElts(IntWidth, NumElements);
1755 ConstantRange ValidIndices(Zero, MaxElts);
1756 ConstantRange IdxRange(IntWidth, true);
1757
1758 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1759 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1760 true, &AC, CtxI, &DT)))
1761 return ScalarizationResult::safe();
1762 return ScalarizationResult::unsafe();
1763 }
1764
1765 // If the index may be poison, check if we can insert a freeze before the
1766 // range of the index is restricted.
1767 Value *IdxBase;
1768 ConstantInt *CI;
1769 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1770 IdxRange = IdxRange.binaryAnd(CI->getValue());
1771 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1772 IdxRange = IdxRange.urem(CI->getValue());
1773 }
1774
1775 if (ValidIndices.contains(IdxRange))
1776 return ScalarizationResult::safeWithFreeze(IdxBase);
1777 return ScalarizationResult::unsafe();
1778}
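// [Editorial example] For a <4 x i32> vector and an index defined as
//   %idx = and i32 %i, 3
// the masked range [0,3] is provably in bounds, but %i itself may be poison,
// so the function reports SafeWithFreeze(%i) and the caller must freeze %i
// before scalarizing the access. (Illustrative values, not from this file.)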
1779
1780/// The memory operation on a vector of \p ScalarType had alignment of
1781/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1782/// alignment that will be valid for the memory operation on a single scalar
1783/// element of the same type with index \p Idx.
1784 static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1785 Type *ScalarType, Value *Idx,
1786 const DataLayout &DL) {
1787 if (auto *C = dyn_cast<ConstantInt>(Idx))
1788 return commonAlignment(VectorAlignment,
1789 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1790 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1791}
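// [Editorial example] Worked instance of the computation above, under assumed
// values: a <4 x i32> access with VectorAlignment = 16 and constant index 1
// touches byte offset 1 * 4 = 4, so the scalar access gets
// commonAlignment(16, 4) = 4. With a non-constant index, the conservative
// fallback uses only the element store size: commonAlignment(16, 4) = 4.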
1792
1793// Combine patterns like:
1794// %0 = load <4 x i32>, <4 x i32>* %a
1795// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1796// store <4 x i32> %1, <4 x i32>* %a
1797// to:
1798// %0 = bitcast <4 x i32>* %a to i32*
1799// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1800// store i32 %b, i32* %1
1801bool VectorCombine::foldSingleElementStore(Instruction &I) {
1802 if (!TTI.allowVectorElementIndexingUsingGEP())
1803 return false;
1804 auto *SI = cast<StoreInst>(&I);
1805 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1806 return false;
1807
1808 // TODO: Combine more complicated patterns (multiple insert) by referencing
1809 // TargetTransformInfo.
1810 Instruction *Source;
1811 Value *NewElement;
1812 Value *Idx;
1813 if (!match(SI->getValueOperand(),
1814 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1815 m_Value(Idx))))
1816 return false;
1817
1818 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1819 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1820 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1821 // Don't optimize for atomic/volatile load or store. Ensure memory is not
1822 // modified between, vector type matches store size, and index is inbounds.
1823 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1824 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1825 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1826 return false;
1827
1828 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1829 if (ScalarizableIdx.isUnsafe() ||
1830 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1831 MemoryLocation::get(SI), AA))
1832 return false;
1833
1834 // Ensure we add the load back to the worklist BEFORE its users so they
1835 // can be erased in the correct order.
1836 Worklist.push(Load);
1837
1838 if (ScalarizableIdx.isSafeWithFreeze())
1839 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1840 Value *GEP = Builder.CreateInBoundsGEP(
1841 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1842 {ConstantInt::get(Idx->getType(), 0), Idx});
1843 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1844 NSI->copyMetadata(*SI);
1845 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1846 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1847 *DL);
1848 NSI->setAlignment(ScalarOpAlignment);
1849 replaceValue(I, *NSI);
1850 eraseInstruction(I);
1851 return true;
1852 }
1853
1854 return false;
1855}
1856
1857/// Try to scalarize vector loads feeding extractelement or bitcast
1858/// instructions.
1859bool VectorCombine::scalarizeLoad(Instruction &I) {
1860 Value *Ptr;
1861 if (!match(&I, m_Load(m_Value(Ptr))))
1862 return false;
1863
1864 auto *LI = cast<LoadInst>(&I);
1865 auto *VecTy = cast<VectorType>(LI->getType());
1866 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
1867 return false;
1868
1869 bool AllExtracts = true;
1870 bool AllBitcasts = true;
1871 Instruction *LastCheckedInst = LI;
1872 unsigned NumInstChecked = 0;
1873
1874 // Check what type of users we have (must either all be extracts or
1875 // bitcasts) and ensure no memory modifications between the load and
1876 // its users.
1877 for (User *U : LI->users()) {
1878 auto *UI = dyn_cast<Instruction>(U);
1879 if (!UI || UI->getParent() != LI->getParent())
1880 return false;
1881
1882 // If any user is waiting to be erased, then bail out as this will
1883 // distort the cost calculation and possibly lead to infinite loops.
1884 if (UI->use_empty())
1885 return false;
1886
1887 if (!isa<ExtractElementInst>(UI))
1888 AllExtracts = false;
1889 if (!isa<BitCastInst>(UI))
1890 AllBitcasts = false;
1891
1892 // Check if any instruction between the load and the user may modify memory.
1893 if (LastCheckedInst->comesBefore(UI)) {
1894 for (Instruction &I :
1895 make_range(std::next(LI->getIterator()), UI->getIterator())) {
1896 // Bail out if we reached the check limit or the instruction may write
1897 // to memory.
1898 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
1899 return false;
1900 NumInstChecked++;
1901 }
1902 LastCheckedInst = UI;
1903 }
1904 }
1905
1906 if (AllExtracts)
1907 return scalarizeLoadExtract(LI, VecTy, Ptr);
1908 if (AllBitcasts)
1909 return scalarizeLoadBitcast(LI, VecTy, Ptr);
1910 return false;
1911}
1912
1913/// Try to scalarize vector loads feeding extractelement instructions.
1914bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
1915 Value *Ptr) {
1916 if (!TTI.allowVectorElementIndexingUsingGEP())
1917 return false;
1918
1919 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
1920 auto FailureGuard = make_scope_exit([&]() {
1921 // If the transform is aborted, discard the ScalarizationResults.
1922 for (auto &Pair : NeedFreeze)
1923 Pair.second.discard();
1924 });
1925
1926 InstructionCost OriginalCost =
1927 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
1928 LI->getPointerAddressSpace(), CostKind);
1929 InstructionCost ScalarizedCost = 0;
1930
1931 for (User *U : LI->users()) {
1932 auto *UI = cast<ExtractElementInst>(U);
1933
1934 auto ScalarIdx =
1935 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
1936 if (ScalarIdx.isUnsafe())
1937 return false;
1938 if (ScalarIdx.isSafeWithFreeze()) {
1939 NeedFreeze.try_emplace(UI, ScalarIdx);
1940 ScalarIdx.discard();
1941 }
1942
1943 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
1944 OriginalCost +=
1945 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
1946 Index ? Index->getZExtValue() : -1);
1947 ScalarizedCost +=
1948 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
1949 Align(1), LI->getPointerAddressSpace(), CostKind);
1950 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
1951 nullptr, nullptr, CostKind);
1952 }
1953
1954 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
1955 << "\n LoadExtractCost: " << OriginalCost
1956 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
1957
1958 if (ScalarizedCost >= OriginalCost)
1959 return false;
1960
1961 // Ensure we add the load back to the worklist BEFORE its users so they
1962 // can be erased in the correct order.
1963 Worklist.push(LI);
1964
1965 Type *ElemType = VecTy->getElementType();
1966
1967 // Replace extracts with narrow scalar loads.
1968 for (User *U : LI->users()) {
1969 auto *EI = cast<ExtractElementInst>(U);
1970 Value *Idx = EI->getIndexOperand();
1971
1972 // Insert 'freeze' for poison indexes.
1973 auto It = NeedFreeze.find(EI);
1974 if (It != NeedFreeze.end())
1975 It->second.freeze(Builder, *cast<Instruction>(Idx));
1976
1977 Builder.SetInsertPoint(EI);
1978 Value *GEP =
1979 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
1980 auto *NewLoad = cast<LoadInst>(
1981 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
1982
1983 Align ScalarOpAlignment =
1984 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
1985 NewLoad->setAlignment(ScalarOpAlignment);
1986
1987 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
1988 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
1989 AAMDNodes OldAAMD = LI->getAAMetadata();
1990 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
1991 }
1992
1993 replaceValue(*EI, *NewLoad, false);
1994 }
1995
1996 FailureGuard.release();
1997 return true;
1998}
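// [Editorial example] A sketch of the load-extract scalarization above
// (illustrative IR, assuming both indices are safe):
//   %v  = load <4 x i32>, ptr %p, align 16
//   %e0 = extractelement <4 x i32> %v, i64 0
//   %e2 = extractelement <4 x i32> %v, i64 2
// becomes:
//   %g0 = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 0
//   %e0 = load i32, ptr %g0, align 16
//   %g2 = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 2
//   %e2 = load i32, ptr %g2, align 8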
1999
2000/// Try to scalarize vector loads feeding bitcast instructions.
2001bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
2002 Value *Ptr) {
2003 InstructionCost OriginalCost =
2004 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2005 LI->getPointerAddressSpace(), CostKind);
2006
2007 Type *TargetScalarType = nullptr;
2008 unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
2009
2010 for (User *U : LI->users()) {
2011 auto *BC = cast<BitCastInst>(U);
2012
2013 Type *DestTy = BC->getDestTy();
2014 if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
2015 return false;
2016
2017 unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
2018 if (DestBitWidth != VecBitWidth)
2019 return false;
2020
2021 // All bitcasts must target the same scalar type.
2022 if (!TargetScalarType)
2023 TargetScalarType = DestTy;
2024 else if (TargetScalarType != DestTy)
2025 return false;
2026
2027 OriginalCost +=
2028 TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
2029 TTI::CastContextHint::None, CostKind, BC);
2030 }
2031
2032 if (!TargetScalarType)
2033 return false;
2034
2035 assert(!LI->user_empty() && "Unexpected load without bitcast users");
2036 InstructionCost ScalarizedCost =
2037 TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
2038 LI->getPointerAddressSpace(), CostKind);
2039
2040 LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2041 << "\n OriginalCost: " << OriginalCost
2042 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2043
2044 if (ScalarizedCost >= OriginalCost)
2045 return false;
2046
2047 // Ensure we add the load back to the worklist BEFORE its users so they
2048 // can be erased in the correct order.
2049 Worklist.push(LI);
2050
2051 Builder.SetInsertPoint(LI);
2052 auto *ScalarLoad =
2053 Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
2054 ScalarLoad->setAlignment(LI->getAlign());
2055 ScalarLoad->copyMetadata(*LI);
2056
2057 // Replace all bitcast users with the scalar load.
2058 for (User *U : LI->users()) {
2059 auto *BC = cast<BitCastInst>(U);
2060 replaceValue(*BC, *ScalarLoad, false);
2061 }
2062
2063 return true;
2064}
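// [Editorial example] A sketch of the load-bitcast scalarization above
// (illustrative IR):
//   %v = load <4 x i8>, ptr %p
//   %i = bitcast <4 x i8> %v to i32
// becomes a single scalar load when the cost model agrees:
//   %i = load i32, ptr %p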
2065
2066bool VectorCombine::scalarizeExtExtract(Instruction &I) {
2067 if (!TTI.allowVectorElementIndexingUsingGEP())
2068 return false;
2069 auto *Ext = dyn_cast<ZExtInst>(&I);
2070 if (!Ext)
2071 return false;
2072
2073 // Try to convert a vector zext feeding only extracts to a set of scalar
2074 // (Src >> (ExtIdx * Size)) & ((1 << Size) - 1)
2075 // operations, if profitable.
2076 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2077 if (!SrcTy)
2078 return false;
2079 auto *DstTy = cast<FixedVectorType>(Ext->getType());
2080
2081 Type *ScalarDstTy = DstTy->getElementType();
2082 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2083 return false;
2084
2085 InstructionCost VectorCost =
2086 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2087 TTI::CastContextHint::None, CostKind, Ext);
2088 unsigned ExtCnt = 0;
2089 bool ExtLane0 = false;
2090 for (User *U : Ext->users()) {
2091 uint64_t Idx;
2092 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2093 return false;
2094 if (cast<Instruction>(U)->use_empty())
2095 continue;
2096 ExtCnt += 1;
2097 ExtLane0 |= !Idx;
2098 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2099 CostKind, Idx, U);
2100 }
2101
2102 InstructionCost ScalarCost =
2103 ExtCnt * TTI.getArithmeticInstrCost(
2104 Instruction::And, ScalarDstTy, CostKind,
2105 {TTI::OK_AnyValue, TTI::OP_None},
2106 {TTI::OK_NonUniformConstantValue, TTI::OP_None}) +
2107 (ExtCnt - ExtLane0) *
2108 TTI.getArithmeticInstrCost(
2109 Instruction::LShr, ScalarDstTy, CostKind,
2110 {TTI::OK_AnyValue, TTI::OP_None},
2111 {TTI::OK_NonUniformConstantValue, TTI::OP_None});
2112 if (ScalarCost > VectorCost)
2113 return false;
2114
2115 Value *ScalarV = Ext->getOperand(0);
2116 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2117 &DT)) {
2118 // Check whether all lanes are extracted, all extracts trigger UB
2119 // on poison, and the last extract (and hence all previous ones)
2120 // are guaranteed to execute if Ext executes. If so, we do not
2121 // need to insert a freeze.
2122 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2123 bool AllExtractsTriggerUB = true;
2124 ExtractElementInst *LastExtract = nullptr;
2125 BasicBlock *ExtBB = Ext->getParent();
2126 for (User *U : Ext->users()) {
2127 auto *Extract = cast<ExtractElementInst>(U);
2128 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2129 AllExtractsTriggerUB = false;
2130 break;
2131 }
2132 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2133 if (!LastExtract || LastExtract->comesBefore(Extract))
2134 LastExtract = Extract;
2135 }
2136 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2137 !AllExtractsTriggerUB ||
2138 !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(),
2139 LastExtract->getIterator()))
2140 ScalarV = Builder.CreateFreeze(ScalarV);
2141 }
2142 ScalarV = Builder.CreateBitCast(
2143 ScalarV,
2144 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2145 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2146 uint64_t EltBitMask = (1ull << SrcEltSizeInBits) - 1;
2147 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2148 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2149 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2150 for (User *U : Ext->users()) {
2151 auto *Extract = cast<ExtractElementInst>(U);
2152 uint64_t Idx =
2153 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2154 uint64_t ShiftAmt =
2155 DL->isBigEndian()
2156 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2157 : (Idx * SrcEltSizeInBits);
2158 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2159 Value *And = Builder.CreateAnd(LShr, Mask);
2160 U->replaceAllUsesWith(And);
2161 }
2162 return true;
2163}
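// [Editorial example] A sketch of the zext-extract scalarization above on a
// little-endian target (illustrative IR):
//   %z  = zext <4 x i8> %src to <4 x i32>
//   %e2 = extractelement <4 x i32> %z, i64 2
// becomes
//   %s  = bitcast <4 x i8> %src to i32
//   %sh = lshr i32 %s, 16
//   %e2 = and i32 %sh, 255
// where 16 = 2 * 8 is the lane's bit offset and 255 masks one element.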
2164
2165/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2166/// to "(bitcast (concat X, Y))"
2167/// where X/Y are bitcasted from i1 mask vectors.
2168bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2169 Type *Ty = I.getType();
2170 if (!Ty->isIntegerTy())
2171 return false;
2172
2173 // TODO: Add big endian test coverage
2174 if (DL->isBigEndian())
2175 return false;
2176
2177 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2178 Instruction *X, *Y;
2179 if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y))))
2180 return false;
2181
2182 // Allow both sources to contain shl, to handle more generic pattern:
2183 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2184 Value *SrcX;
2185 uint64_t ShAmtX = 0;
2186 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2187 !match(X, m_OneUse(
2188 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
2189 m_ConstantInt(ShAmtX)))))
2190 return false;
2191
2192 Value *SrcY;
2193 uint64_t ShAmtY = 0;
2194 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2195 !match(Y, m_OneUse(
2196 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
2197 m_ConstantInt(ShAmtY)))))
2198 return false;
2199
2200 // Canonicalize larger shift to the RHS.
2201 if (ShAmtX > ShAmtY) {
2202 std::swap(X, Y);
2203 std::swap(SrcX, SrcY);
2204 std::swap(ShAmtX, ShAmtY);
2205 }
2206
2207 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2208 // difference is the mask width so they can be easily concatenated together.
2209 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2210 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2211 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2212 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2213 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2214 !MaskTy->getElementType()->isIntegerTy(1) ||
2215 MaskTy->getNumElements() != ShAmtDiff ||
2216 MaskTy->getNumElements() > (BitWidth / 2))
2217 return false;
2218
2219 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2220 auto *ConcatIntTy =
2221 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2222 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2223
2224 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2225 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2226
2227 // TODO: Is it worth supporting multi use cases?
2228 InstructionCost OldCost = 0;
2229 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2230 OldCost +=
2231 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2232 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2233 TTI::CastContextHint::None, CostKind);
2234 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2235 TTI::CastContextHint::None, CostKind);
2236
2237 InstructionCost NewCost = 0;
2238 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ConcatTy,
2239 MaskTy, ConcatMask, CostKind);
2240 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2241 TTI::CastContextHint::None, CostKind);
2242 if (Ty != ConcatIntTy)
2243 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2244 TTI::CastContextHint::None, CostKind);
2245 if (ShAmtX > 0)
2246 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2247
2248 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2249 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2250 << "\n");
2251
2252 if (NewCost > OldCost)
2253 return false;
2254
2255 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2256 // any residual zero-extension or shifting.
2257 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2258 Worklist.pushValue(Concat);
2259
2260 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2261
2262 if (Ty != ConcatIntTy) {
2263 Worklist.pushValue(Result);
2264 Result = Builder.CreateZExt(Result, Ty);
2265 }
2266
2267 if (ShAmtX > 0) {
2268 Worklist.pushValue(Result);
2269 Result = Builder.CreateShl(Result, ShAmtX);
2270 }
2271
2272 replaceValue(I, *Result);
2273 return true;
2274}
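// [Editorial example] A sketch of the bool-mask concatenation above
// (illustrative IR, with ShAmtX = 0 and ShAmtY = 8):
//   %bx = bitcast <8 x i1> %m0 to i8
//   %by = bitcast <8 x i1> %m1 to i8
//   %zx = zext i8 %bx to i16
//   %zy = zext i8 %by to i16
//   %sh = shl i16 %zy, 8
//   %r  = or disjoint i16 %zx, %sh
// becomes:
//   %c = shufflevector <8 x i1> %m0, <8 x i1> %m1,
//                      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5,
//                                  i32 6, i32 7, i32 8, i32 9, i32 10, i32 11,
//                                  i32 12, i32 13, i32 14, i32 15>
//   %r = bitcast <16 x i1> %c to i16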
2275
2276/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2277/// --> "binop (shuffle), (shuffle)".
2278bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2279 BinaryOperator *BinOp;
2280 ArrayRef<int> OuterMask;
2281 if (!match(&I,
2282 m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask))))
2283 return false;
2284
2285 // Don't introduce poison into div/rem.
2286 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2287 return false;
2288
2289 Value *Op00, *Op01, *Op10, *Op11;
2290 ArrayRef<int> Mask0, Mask1;
2291 bool Match0 =
2292 match(BinOp->getOperand(0),
2293 m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
2294 bool Match1 =
2295 match(BinOp->getOperand(1),
2296 m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
2297 if (!Match0 && !Match1)
2298 return false;
2299
2300 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2301 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2302 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2303 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2304
2305 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2306 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2307 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2308 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2309 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2310 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2311 return false;
2312
2313 unsigned NumSrcElts = BinOpTy->getNumElements();
2314
2315 // Don't accept shuffles that reference the second operand in
2316 // div/rem or if it's an undef arg.
2317 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2318 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2319 return false;
2320
2321 // Merge outer / inner (or identity if no match) shuffles.
2322 SmallVector<int> NewMask0, NewMask1;
2323 for (int M : OuterMask) {
2324 if (M < 0 || M >= (int)NumSrcElts) {
2325 NewMask0.push_back(PoisonMaskElem);
2326 NewMask1.push_back(PoisonMaskElem);
2327 } else {
2328 NewMask0.push_back(Match0 ? Mask0[M] : M);
2329 NewMask1.push_back(Match1 ? Mask1[M] : M);
2330 }
2331 }
2332
2333 unsigned NumOpElts = Op0Ty->getNumElements();
2334 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2335 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2336 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2337 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2338 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2339 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2340
2341 // Try to merge shuffles across the binop if the new shuffles are not costly.
2342 InstructionCost OldCost =
2343 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
2344 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
2345 BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
2346 if (Match0)
2347 OldCost += TTI.getShuffleCost(
2348 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2349 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2350 if (Match1)
2351 OldCost += TTI.getShuffleCost(
2352 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2353 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2354
2355 InstructionCost NewCost =
2356 TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2357
2358 if (!IsIdentity0)
2359 NewCost +=
2360 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2361 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2362 if (!IsIdentity1)
2363 NewCost +=
2364 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2365 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2366
2367 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2368 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2369 << "\n");
2370
2371 // If costs are equal, still fold as we reduce instruction count.
2372 if (NewCost > OldCost)
2373 return false;
2374
2375 Value *LHS =
2376 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2377 Value *RHS =
2378 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2379 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2380
2381 // Intersect flags from the old binops.
2382 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2383 NewInst->copyIRFlags(BinOp);
2384
2385 Worklist.pushValue(LHS);
2386 Worklist.pushValue(RHS);
2387 replaceValue(I, *NewBO);
2388 return true;
2389}
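// [Editorial example] A sketch of the permute-of-binops fold above
// (illustrative IR):
//   %s0 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
//   %s1 = shufflevector <4 x i32> %c, <4 x i32> %d, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
//   %bo = add <4 x i32> %s0, %s1
//   %r  = shufflevector <4 x i32> %bo, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// becomes the composed shuffles feeding a single binop:
//   %t0 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 2, i32 5, i32 0>
//   %t1 = shufflevector <4 x i32> %c, <4 x i32> %d, <4 x i32> <i32 3, i32 6, i32 1, i32 4>
//   %r  = add <4 x i32> %t0, %t1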
2390
2391/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2392/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
2393bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2394 ArrayRef<int> OldMask;
2395 Instruction *LHS, *RHS;
2396 if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)),
2397 m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
2398 return false;
2399
2400 // TODO: Add support for addlike etc.
2401 if (LHS->getOpcode() != RHS->getOpcode())
2402 return false;
2403
2404 Value *X, *Y, *Z, *W;
2405 bool IsCommutative = false;
2406 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2407 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2408 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2409 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2410 auto *BO = cast<BinaryOperator>(LHS);
2411 // Don't introduce poison into div/rem.
2412 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2413 return false;
2414 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2415 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2416 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2417 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2418 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2419 } else
2420 return false;
2421
2422 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2423 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2424 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2425 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2426 return false;
2427
2428 unsigned NumSrcElts = BinOpTy->getNumElements();
2429
2430 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2431 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2432 std::swap(X, Y);
2433
2434 auto ConvertToUnary = [NumSrcElts](int &M) {
2435 if (M >= (int)NumSrcElts)
2436 M -= NumSrcElts;
2437 };
2438
2439 SmallVector<int> NewMask0(OldMask);
2440 TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
2441 if (X == Z) {
2442 llvm::for_each(NewMask0, ConvertToUnary);
2443 SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
2444 Z = PoisonValue::get(BinOpTy);
2445 }
2446
2447 SmallVector<int> NewMask1(OldMask);
2448 TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
2449 if (Y == W) {
2450 llvm::for_each(NewMask1, ConvertToUnary);
2451 SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
2452 W = PoisonValue::get(BinOpTy);
2453 }
2454
2455 // Try to replace a binop with a shuffle if the shuffle is not costly.
2456 InstructionCost OldCost =
2457 TTI.getInstructionCost(LHS, CostKind) +
2458 TTI.getInstructionCost(RHS, CostKind) +
2459 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2460 BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
2461 &I);
2462
2463 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2464 // where one use shuffles have gotten split across the binop/cmp. These
2465 // often allow a major reduction in total cost that wouldn't happen as
2466 // individual folds.
2467 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2468 TTI::TargetCostKind CostKind) -> bool {
2469 Value *InnerOp;
2470 ArrayRef<int> InnerMask;
2471 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2472 m_Mask(InnerMask)))) &&
2473 InnerOp->getType() == Op->getType() &&
2474 all_of(InnerMask,
2475 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2476 for (int &M : Mask)
2477 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2478 M = InnerMask[M - Offset];
2479 M = 0 <= M ? M + Offset : M;
2480 }
2481 OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind);
2482 Op = InnerOp;
2483 return true;
2484 }
2485 return false;
2486 };
2487 bool ReducedInstCount = false;
2488 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2489 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2490 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2491 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2492
2493 auto *ShuffleCmpTy =
2494 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2495 InstructionCost NewCost =
2496 TTI.getShuffleCost(SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0,
2497 nullptr, {X, Z}) +
2498 TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1, CostKind, 0,
2499 nullptr, {Y, W});
2500
2501 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2502 NewCost +=
2503 TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
2504 } else {
2505 NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
2506 ShuffleDstTy, PredLHS, CostKind);
2507 }
2508
2509 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2510 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2511 << "\n");
2512
2513 // If either shuffle will constant fold away, then fold for the same cost as
2514 // we will reduce the instruction count.
2515 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2516 (isa<Constant>(Y) && isa<Constant>(W));
2517 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2518 return false;
2519
2520 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2521 Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
2522 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2523 ? Builder.CreateBinOp(
2524 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2525 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2526
2527 // Intersect flags from the old binops.
2528 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2529 NewInst->copyIRFlags(LHS);
2530 NewInst->andIRFlags(RHS);
2531 }
2532
2533 Worklist.pushValue(Shuf0);
2534 Worklist.pushValue(Shuf1);
2535 replaceValue(I, *NewBO);
2536 return true;
2537}
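// [Editorial example] A sketch of the shuffle-of-binops fold above
// (illustrative IR):
//   %b0 = add <4 x i32> %x, %y
//   %b1 = add <4 x i32> %z, %w
//   %r  = shufflevector <4 x i32> %b0, <4 x i32> %b1,
//                       <4 x i32> <i32 0, i32 4, i32 1, i32 5>
// becomes:
//   %s0 = shufflevector <4 x i32> %x, <4 x i32> %z,
//                       <4 x i32> <i32 0, i32 4, i32 1, i32 5>
//   %s1 = shufflevector <4 x i32> %y, <4 x i32> %w,
//                       <4 x i32> <i32 0, i32 4, i32 1, i32 5>
//   %r  = add <4 x i32> %s0, %s1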
2538
2539/// Try to convert,
2540/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2541/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
2542bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2543 ArrayRef<int> Mask;
2544 Value *C1, *T1, *F1, *C2, *T2, *F2;
2545 if (!match(&I, m_Shuffle(
2546 m_OneUse(m_Select(m_Value(C1), m_Value(T1), m_Value(F1))),
2547 m_OneUse(m_Select(m_Value(C2), m_Value(T2), m_Value(F2))),
2548 m_Mask(Mask))))
2549 return false;
2550
2551 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2552 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2553 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2554 return false;
2555
2556 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2557 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2558 // SelectInsts must have the same FMF.
2559 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2560 ((SI0FOp != nullptr) &&
2561 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2562 return false;
2563
2564 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2565 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2566 auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
2567 auto SelOp = Instruction::Select;
2568 InstructionCost OldCost = TTI.getCmpSelInstrCost(
2569 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2570 OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
2571 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2572 OldCost +=
2573 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2574 {I.getOperand(0), I.getOperand(1)}, &I);
2575
2576 InstructionCost NewCost = TTI.getShuffleCost(
2577 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2578 Mask, CostKind, 0, nullptr, {C1, C2});
2579 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2580 nullptr, {T1, T2});
2581 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2582 nullptr, {F1, F2});
2583 auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2584 toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2585 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2586 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2587
2588 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2589 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2590 << "\n");
2591 if (NewCost > OldCost)
2592 return false;
2593
2594 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2595 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2596 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2597 Value *NewSel;
2598 // We presuppose that the SelectInsts have the same FMF.
2599 if (SI0FOp)
2600 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2601 SI0FOp->getFastMathFlags());
2602 else
2603 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2604
2605 Worklist.pushValue(ShuffleCmp);
2606 Worklist.pushValue(ShuffleTrue);
2607 Worklist.pushValue(ShuffleFalse);
2608 replaceValue(I, *NewSel);
2609 return true;
2610}
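// [Editorial example] A sketch of the shuffle-of-selects fold above
// (illustrative IR):
//   %s1 = select <4 x i1> %c1, <4 x float> %t1, <4 x float> %f1
//   %s2 = select <4 x i1> %c2, <4 x float> %t2, <4 x float> %f2
//   %r  = shufflevector <4 x float> %s1, <4 x float> %s2,
//                       <4 x i32> <i32 0, i32 1, i32 4, i32 5>
// becomes:
//   %c = shufflevector <4 x i1> %c1, <4 x i1> %c2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
//   %t = shufflevector <4 x float> %t1, <4 x float> %t2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
//   %f = shufflevector <4 x float> %f1, <4 x float> %f2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
//   %r = select <4 x i1> %c, <4 x float> %t, <4 x float> %f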
2611
2612/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2613/// into "castop (shuffle)".
2614bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2615 Value *V0, *V1;
2616 ArrayRef<int> OldMask;
2617 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2618 return false;
2619
2620 // Check whether this is a binary shuffle.
2621 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2622
2623 auto *C0 = dyn_cast<CastInst>(V0);
2624 auto *C1 = dyn_cast<CastInst>(V1);
2625 if (!C0 || (IsBinaryShuffle && !C1))
2626 return false;
2627
2628 Instruction::CastOps Opcode = C0->getOpcode();
2629
2630 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2631 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2632 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2633 return false;
2634
2635 if (IsBinaryShuffle) {
2636 if (C0->getSrcTy() != C1->getSrcTy())
2637 return false;
2638 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2639 if (Opcode != C1->getOpcode()) {
2640 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2641 Opcode = Instruction::SExt;
2642 else
2643 return false;
2644 }
2645 }
2646
2647 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2648 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2649 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2650 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2651 return false;
2652
2653 unsigned NumSrcElts = CastSrcTy->getNumElements();
2654 unsigned NumDstElts = CastDstTy->getNumElements();
2655 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2656 "Only bitcasts expected to alter src/dst element counts");
2657
2658 // Check for bitcasting of unscalable vector types.
2659 // e.g. <32 x i40> -> <40 x i32>
2660 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2661 (NumDstElts % NumSrcElts) != 0)
2662 return false;
2663
2664 SmallVector<int, 16> NewMask;
2665 if (NumSrcElts >= NumDstElts) {
2666 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2667 // always be expanded to the equivalent form choosing narrower elements.
2668 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2669 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2670 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2671 } else {
2672 // The bitcast is from narrow elements to wide elements. The shuffle mask
2673 // must choose consecutive elements to allow casting first.
2674 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2675 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2676 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2677 return false;
2678 }
2679
2680 auto *NewShuffleDstTy =
2681 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2682
2683 // Try to replace a castop with a shuffle if the shuffle is not costly.
2684 InstructionCost CostC0 =
2685 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2686 TTI::CastContextHint::None, CostKind);
2687
2688 TargetTransformInfo::ShuffleKind ShuffleKind;
2689 if (IsBinaryShuffle)
2690 ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc;
2691 else
2692 ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc;
2693
2694 InstructionCost OldCost = CostC0;
2695 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2696 CostKind, 0, nullptr, {}, &I);
2697
2698 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2699 CastSrcTy, NewMask, CostKind);
2700 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
2701 TTI::CastContextHint::None, CostKind);
2702 if (!C0->hasOneUse())
2703 NewCost += CostC0;
2704 if (IsBinaryShuffle) {
2705 InstructionCost CostC1 =
2706 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2707 TTI::CastContextHint::None, CostKind);
2708 OldCost += CostC1;
2709 if (!C1->hasOneUse())
2710 NewCost += CostC1;
2711 }
2712
2713 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2714 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2715 << "\n");
2716 if (NewCost > OldCost)
2717 return false;
2718
2719 Value *Shuf;
2720 if (IsBinaryShuffle)
2721 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2722 NewMask);
2723 else
2724 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2725
2726 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2727
2728 // Intersect flags from the old casts.
2729 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2730 NewInst->copyIRFlags(C0);
2731 if (IsBinaryShuffle)
2732 NewInst->andIRFlags(C1);
2733 }
2734
2735 Worklist.pushValue(Shuf);
2736 replaceValue(I, *Cast);
2737 return true;
2738}
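// [Editorial example] A sketch of the shuffle-of-casts fold above
// (illustrative IR):
//   %c0 = zext <4 x i16> %a to <4 x i32>
//   %c1 = zext <4 x i16> %b to <4 x i32>
//   %r  = shufflevector <4 x i32> %c0, <4 x i32> %c1,
//                       <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
// becomes a shuffle of the narrow sources followed by one cast:
//   %s = shufflevector <4 x i16> %a, <4 x i16> %b,
//                      <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
//   %r = zext <8 x i16> %s to <8 x i32>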
2739
2740/// Try to convert any of:
2741/// "shuffle (shuffle x, y), (shuffle y, x)"
2742/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2743/// "shuffle (shuffle x, undef), y"
2744/// "shuffle x, (shuffle y, undef)"
2745/// into "shuffle x, y".
2746bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2747 ArrayRef<int> OuterMask;
2748 Value *OuterV0, *OuterV1;
2749 if (!match(&I,
2750 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2751 return false;
2752
2753 ArrayRef<int> InnerMask0, InnerMask1;
2754 Value *X0, *X1, *Y0, *Y1;
2755 bool Match0 =
2756 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2757 bool Match1 =
2758 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2759 if (!Match0 && !Match1)
2760 return false;
2761
2762 // If the outer shuffle is a permute, then create a fake inner all-poison
2763 // shuffle. This is easier than accounting for length-changing shuffles below.
2764 SmallVector<int, 16> PoisonMask1;
2765 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2766 X1 = X0;
2767 Y1 = Y0;
2768 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2769 InnerMask1 = PoisonMask1;
2770 Match1 = true; // fake match
2771 }
2772
2773 X0 = Match0 ? X0 : OuterV0;
2774 Y0 = Match0 ? Y0 : OuterV0;
2775 X1 = Match1 ? X1 : OuterV1;
2776 Y1 = Match1 ? Y1 : OuterV1;
2777 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2778 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2779 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2780 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2781 X0->getType() != X1->getType())
2782 return false;
2783
2784 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2785 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2786
2787 // Attempt to merge shuffles, matching up to 2 source operands.
2788 // Replace index to a poison arg with PoisonMaskElem.
2789 // Bail if either inner masks reference an undef arg.
2790 SmallVector<int, 16> NewMask(OuterMask);
2791 Value *NewX = nullptr, *NewY = nullptr;
2792 for (int &M : NewMask) {
2793 Value *Src = nullptr;
2794 if (0 <= M && M < (int)NumImmElts) {
2795 Src = OuterV0;
2796 if (Match0) {
2797 M = InnerMask0[M];
2798 Src = M >= (int)NumSrcElts ? Y0 : X0;
2799 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2800 }
2801 } else if (M >= (int)NumImmElts) {
2802 Src = OuterV1;
2803 M -= NumImmElts;
2804 if (Match1) {
2805 M = InnerMask1[M];
2806 Src = M >= (int)NumSrcElts ? Y1 : X1;
2807 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2808 }
2809 }
2810 if (Src && M != PoisonMaskElem) {
2811 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2812 if (isa<UndefValue>(Src)) {
2813 // We've referenced an undef element - if its poison, update the shuffle
2814 // mask, else bail.
2815 if (!isa<PoisonValue>(Src))
2816 return false;
2817 M = PoisonMaskElem;
2818 continue;
2819 }
2820 if (!NewX || NewX == Src) {
2821 NewX = Src;
2822 continue;
2823 }
2824 if (!NewY || NewY == Src) {
2825 M += NumSrcElts;
2826 NewY = Src;
2827 continue;
2828 }
2829 return false;
2830 }
2831 }
2832
2833 if (!NewX)
2834 return PoisonValue::get(ShuffleDstTy);
2835 if (!NewY)
2836 NewY = PoisonValue::get(ShuffleSrcTy);
2837
2838 // Have we folded to an Identity shuffle?
2839 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
2840 replaceValue(I, *NewX);
2841 return true;
2842 }
2843
2844 // Try to merge the shuffles if the new shuffle is not costly.
2845 InstructionCost InnerCost0 = 0;
2846 if (Match0)
2847 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
2848
2849 InstructionCost InnerCost1 = 0;
2850 if (Match1)
2851 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
2852
2853 InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind);
2854
2855 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
2856
2857 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
2858 TargetTransformInfo::ShuffleKind SK =
2859 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
2860 : TargetTransformInfo::SK_PermuteTwoSrc;
2861 InstructionCost NewCost =
2862 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
2863 nullptr, {NewX, NewY});
2864 if (!OuterV0->hasOneUse())
2865 NewCost += InnerCost0;
2866 if (!OuterV1->hasOneUse())
2867 NewCost += InnerCost1;
2868
2869 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
2870 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2871 << "\n");
2872 if (NewCost > OldCost)
2873 return false;
2874
2875 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
2876 replaceValue(I, *Shuf);
2877 return true;
2878}
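// [Editorial example] A sketch of the shuffle merging above (illustrative IR):
//   %s0 = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %s1 = shufflevector <4 x i32> %y, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %r  = shufflevector <4 x i32> %s0, <4 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
// becomes a single two-source shuffle:
//   %r  = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 3, i32 2, i32 7, i32 6>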
2879
2880/// Try to convert
2881/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
2882bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
2883 Value *V0, *V1;
2884 ArrayRef<int> OldMask;
2885 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
2886 m_Mask(OldMask))))
2887 return false;
2888
2889 auto *II0 = dyn_cast<IntrinsicInst>(V0);
2890 auto *II1 = dyn_cast<IntrinsicInst>(V1);
2891 if (!II0 || !II1)
2892 return false;
2893
2894 Intrinsic::ID IID = II0->getIntrinsicID();
2895 if (IID != II1->getIntrinsicID())
2896 return false;
2897
2898 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2899 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
2900 if (!ShuffleDstTy || !II0Ty)
2901 return false;
2902
2903 if (!isTriviallyVectorizable(IID))
2904 return false;
2905
2906 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2907 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) &&
2908 II0->getArgOperand(I) != II1->getArgOperand(I))
2909 return false;
2910
2911 InstructionCost OldCost =
2912 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
2913 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
2914 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2915 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
2916
2917 SmallVector<Type *> NewArgsTy;
2918 InstructionCost NewCost = 0;
2919 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
2920 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
2921 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
2922 } else {
2923 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
2924 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
2925 ShuffleDstTy->getNumElements());
2926 NewArgsTy.push_back(ArgTy);
2927 NewCost += TTI.getShuffleCost(
2928 TargetTransformInfo::SK_PermuteTwoSrc, ArgTy, VecTy, OldMask,
2929 CostKind, 0, nullptr, {II0->getArgOperand(I), II1->getArgOperand(I)});
2930 }
2931 }
2932 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
2933 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
2934
2935 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
2936 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2937 << "\n");
2938
2939 if (NewCost > OldCost)
2940 return false;
2941
2942 SmallVector<Value *> NewArgs;
2943 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2944 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
2945 NewArgs.push_back(II0->getArgOperand(I));
2946 } else {
2947 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
2948 II1->getArgOperand(I), OldMask);
2949 NewArgs.push_back(Shuf);
2950 Worklist.pushValue(Shuf);
2951 }
2952 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
2953
2954 // Intersect flags from the old intrinsics.
2955 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
2956 NewInst->copyIRFlags(II0);
2957 NewInst->andIRFlags(II1);
2958 }
2959
2960 replaceValue(I, *NewIntrinsic);
2961 return true;
2962}
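// [Editorial example] A sketch of the shuffle-of-intrinsics fold above
// (illustrative IR):
//   %a = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %x0, <4 x i32> %y0)
//   %b = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %x1, <4 x i32> %y1)
//   %r = shufflevector <4 x i32> %a, <4 x i32> %b,
//                      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// becomes:
//   %sx = shufflevector <4 x i32> %x0, <4 x i32> %x1,
//                       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
//   %sy = shufflevector <4 x i32> %y0, <4 x i32> %y1,
//                       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
//   %r  = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %sx, <8 x i32> %sy)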
2963
2964using InstLane = std::pair<Use *, int>;
2965
2966static InstLane lookThroughShuffles(Use *U, int Lane) {
2967 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
2968 unsigned NumElts =
2969 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
2970 int M = SV->getMaskValue(Lane);
2971 if (M < 0)
2972 return {nullptr, PoisonMaskElem};
2973 if (static_cast<unsigned>(M) < NumElts) {
2974 U = &SV->getOperandUse(0);
2975 Lane = M;
2976 } else {
2977 U = &SV->getOperandUse(1);
2978 Lane = M - NumElts;
2979 }
2980 }
2981 return InstLane{U, Lane};
2982}
2983
2984 static SmallVector<InstLane>
2985 generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
2986 SmallVector<InstLane> NItem;
2987 for (InstLane IL : Item) {
2988 auto [U, Lane] = IL;
2989 InstLane OpLane =
2990 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
2991 Lane)
2992 : InstLane{nullptr, PoisonMaskElem};
2993 NItem.emplace_back(OpLane);
2994 }
2995 return NItem;
2996}
2997
2998/// Detect concat of multiple values into a vector
2999 static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
3000 const TargetTransformInfo &TTI) {
3001 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
3002 unsigned NumElts = Ty->getNumElements();
3003 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
3004 return false;
3005
3006 // Check that the concat is free, usually meaning that the type will be split
3007 // during legalization.
3008 SmallVector<int, 16> ConcatMask(NumElts * 2);
3009 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
3010 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
3011 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
3012 Ty, ConcatMask, CostKind) != 0)
3013 return false;
3014
3015 unsigned NumSlices = Item.size() / NumElts;
3016 // Currently we generate a tree of shuffles for the concats, which limits us
3017 // to a power of 2.
3018 if (!isPowerOf2_32(NumSlices))
3019 return false;
3020 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
3021 Use *SliceV = Item[Slice * NumElts].first;
3022 if (!SliceV || SliceV->get()->getType() != Ty)
3023 return false;
3024 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
3025 auto [V, Lane] = Item[Slice * NumElts + Elt];
3026 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
3027 return false;
3028 }
3029 }
3030 return true;
3031}
3032
3033 static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
3034 const SmallPtrSet<Use *, 4> &IdentityLeafs,
3035 const SmallPtrSet<Use *, 4> &SplatLeafs,
3036 const SmallPtrSet<Use *, 4> &ConcatLeafs,
3037 IRBuilderBase &Builder,
3038 const TargetTransformInfo *TTI) {
3039 auto [FrontU, FrontLane] = Item.front();
3040
3041 if (IdentityLeafs.contains(FrontU)) {
3042 return FrontU->get();
3043 }
3044 if (SplatLeafs.contains(FrontU)) {
3045 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
3046 return Builder.CreateShuffleVector(FrontU->get(), Mask);
3047 }
3048 if (ConcatLeafs.contains(FrontU)) {
3049 unsigned NumElts =
3050 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
3051 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
3052 for (unsigned S = 0; S < Values.size(); ++S)
3053 Values[S] = Item[S * NumElts].first->get();
3054
3055 while (Values.size() > 1) {
3056 NumElts *= 2;
3057 SmallVector<int, 16> Mask(NumElts, 0);
3058 std::iota(Mask.begin(), Mask.end(), 0);
3059 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
3060 for (unsigned S = 0; S < NewValues.size(); ++S)
3061 NewValues[S] =
3062 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
3063 Values = NewValues;
3064 }
3065 return Values[0];
3066 }
3067
3068 auto *I = cast<Instruction>(FrontU->get());
3069 auto *II = dyn_cast<IntrinsicInst>(I);
3070 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
3071 SmallVector<Value *> Ops(NumOps);
3072 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
3073 if (II &&
3074 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
3075 Ops[Idx] = II->getOperand(Idx);
3076 continue;
3077 }
3078 Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
3079 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
3080 Builder, TTI);
3081 }
3082
3083 SmallVector<Value *, 8> ValueList;
3084 for (const auto &Lane : Item)
3085 if (Lane.first)
3086 ValueList.push_back(Lane.first->get());
3087
3088 Type *DstTy =
3089 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
3090 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
3091 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
3092 Ops[0], Ops[1]);
3093 propagateIRFlags(Value, ValueList);
3094 return Value;
3095 }
3096 if (auto *CI = dyn_cast<CmpInst>(I)) {
3097 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3098 propagateIRFlags(Value, ValueList);
3099 return Value;
3100 }
3101 if (auto *SI = dyn_cast<SelectInst>(I)) {
3102 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3103 propagateIRFlags(Value, ValueList);
3104 return Value;
3105 }
3106 if (auto *CI = dyn_cast<CastInst>(I)) {
3107 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3108 propagateIRFlags(Value, ValueList);
3109 return Value;
3110 }
3111 if (II) {
3112 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3113 propagateIRFlags(Value, ValueList);
3114 return Value;
3115 }
3116 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3117 auto *Value =
3118 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3119 propagateIRFlags(Value, ValueList);
3120 return Value;
3121}
3122
3123// Starting from a shuffle, look up through operands tracking the shuffled index
3124// of each lane. If we can simplify away the shuffles to identities then
3125// do so.
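// Ex (illustrative):
//   %a = shufflevector <4 x i32> %x, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
//   %b = shufflevector <4 x i32> %x, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
//   %r = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// simplifies to %x, since lane M of %r is always lane M of %x.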
3126bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3127 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3128 if (!Ty || I.use_empty())
3129 return false;
3130
3131 SmallVector<InstLane> Start(Ty->getNumElements());
3132 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3133 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
3134
3135 SmallVector<SmallVector<InstLane>> Worklist;
3136 Worklist.push_back(Start);
3137 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
3138 unsigned NumVisited = 0;
3139
3140 while (!Worklist.empty()) {
3141 if (++NumVisited > MaxInstrsToScan)
3142 return false;
3143
3144 SmallVector<InstLane> Item = Worklist.pop_back_val();
3145 auto [FrontU, FrontLane] = Item.front();
3146
3147 // If we found an undef first lane then bail out to keep things simple.
3148 if (!FrontU)
3149 return false;
3150
3151 // Helper to peek through bitcasts to the same value.
3152 auto IsEquiv = [&](Value *X, Value *Y) {
3153 return X->getType() == Y->getType() &&
3154 peekThroughBitcasts(X) == peekThroughBitcasts(Y);
3155 };
3156
3157 // Look for an identity value.
3158 if (FrontLane == 0 &&
3159 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
3160 Ty->getNumElements() &&
3161 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3162 Value *FrontV = Item.front().first->get();
3163 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
3164 E.value().second == (int)E.index());
3165 })) {
3166 IdentityLeafs.insert(FrontU);
3167 continue;
3168 }
3169 // Look for constants, for the moment only supporting constant splats.
3170 if (auto *C = dyn_cast<Constant>(FrontU);
3171 C && C->getSplatValue() &&
3172 all_of(drop_begin(Item), [Item](InstLane &IL) {
3173 Value *FrontV = Item.front().first->get();
3174 Use *U = IL.first;
3175 return !U || (isa<Constant>(U->get()) &&
3176 cast<Constant>(U->get())->getSplatValue() ==
3177 cast<Constant>(FrontV)->getSplatValue());
3178 })) {
3179 SplatLeafs.insert(FrontU);
3180 continue;
3181 }
3182 // Look for a splat value.
3183 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3184 auto [FrontU, FrontLane] = Item.front();
3185 auto [U, Lane] = IL;
3186 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3187 })) {
3188 SplatLeafs.insert(FrontU);
3189 continue;
3190 }
3191
3192 // We need each element to be the same type of value, and check that each
3193 // element has a single use.
3194 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3195 Value *FrontV = Item.front().first->get();
3196 if (!IL.first)
3197 return true;
3198 Value *V = IL.first->get();
3199 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3200 return false;
3201 if (V->getValueID() != FrontV->getValueID())
3202 return false;
3203 if (auto *CI = dyn_cast<CmpInst>(V))
3204 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3205 return false;
3206 if (auto *CI = dyn_cast<CastInst>(V))
3207 if (CI->getSrcTy()->getScalarType() !=
3208 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3209 return false;
3210 if (auto *SI = dyn_cast<SelectInst>(V))
3211 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3212 SI->getOperand(0)->getType() !=
3213 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3214 return false;
3215 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3216 return false;
3217 auto *II = dyn_cast<IntrinsicInst>(V);
3218 return !II || (isa<IntrinsicInst>(FrontV) &&
3219 II->getIntrinsicID() ==
3220 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3221 !II->hasOperandBundles());
3222 };
3223 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3224 // Check the operator is one that we support.
3225 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3226 // We exclude div/rem in case they hit UB from poison lanes.
3227 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3228 BO && BO->isIntDivRem())
3229 return false;
3230 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3231 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3232 continue;
3233 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3234 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3235 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3236 continue;
3237 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3238 // TODO: Handle vector widening/narrowing bitcasts.
3239 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3240 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3241 if (DstTy && SrcTy &&
3242 SrcTy->getNumElements() == DstTy->getNumElements()) {
3243 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3244 continue;
3245 }
3246 } else if (isa<SelectInst>(FrontU)) {
3247 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3248 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3249 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
3250 continue;
3251 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3252 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3253 !II->hasOperandBundles()) {
3254 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3255 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3256 &TTI)) {
3257 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3258 Value *FrontV = Item.front().first->get();
3259 Use *U = IL.first;
3260 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3261 cast<Instruction>(FrontV)->getOperand(Op));
3262 }))
3263 return false;
3264 continue;
3265 }
3266 Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
3267 }
3268 continue;
3269 }
3270 }
3271
3272 if (isFreeConcat(Item, CostKind, TTI)) {
3273 ConcatLeafs.insert(FrontU);
3274 continue;
3275 }
3276
3277 return false;
3278 }
3279
3280 if (NumVisited <= 1)
3281 return false;
3282
3283 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3284
3285 // If we got this far, we know the shuffles are superfluous and can be
3286 // removed. Scan through again and generate the new tree of instructions.
3287 Builder.SetInsertPoint(&I);
3288 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3289 ConcatLeafs, Builder, &TTI);
3290 replaceValue(I, *V);
3291 return true;
3292}
3293
3294/// Given a commutative reduction, the order of the input lanes does not alter
3295/// the results. We can use this to remove certain shuffles feeding the
3296/// reduction, removing the need to shuffle at all.
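/// Ex (illustrative): vector_reduce_add(shuffle(%v, poison, <1, 0, 3, 2>))
/// sums the same lanes as vector_reduce_add(%v); sorting the mask gives the
/// identity mask <0, 1, 2, 3>, so the shuffle can be removed.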
3297bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3298 auto *II = dyn_cast<IntrinsicInst>(&I);
3299 if (!II)
3300 return false;
3301 switch (II->getIntrinsicID()) {
3302 case Intrinsic::vector_reduce_add:
3303 case Intrinsic::vector_reduce_mul:
3304 case Intrinsic::vector_reduce_and:
3305 case Intrinsic::vector_reduce_or:
3306 case Intrinsic::vector_reduce_xor:
3307 case Intrinsic::vector_reduce_smin:
3308 case Intrinsic::vector_reduce_smax:
3309 case Intrinsic::vector_reduce_umin:
3310 case Intrinsic::vector_reduce_umax:
3311 break;
3312 default:
3313 return false;
3314 }
3315
3316 // Find all the inputs when looking through operations that do not alter the
3317 // lane order (binops, for example). Currently we look for a single shuffle,
3318 // and can ignore splat values.
3319 std::queue<Value *> Worklist;
3320 SmallPtrSet<Value *, 4> Visited;
3321 ShuffleVectorInst *Shuffle = nullptr;
3322 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3323 Worklist.push(Op);
3324
3325 while (!Worklist.empty()) {
3326 Value *CV = Worklist.front();
3327 Worklist.pop();
3328 if (Visited.contains(CV))
3329 continue;
3330
3331 // Splats don't change the order, so can be safely ignored.
3332 if (isSplatValue(CV))
3333 continue;
3334
3335 Visited.insert(CV);
3336
3337 if (auto *CI = dyn_cast<Instruction>(CV)) {
3338 if (CI->isBinaryOp()) {
3339 for (auto *Op : CI->operand_values())
3340 Worklist.push(Op);
3341 continue;
3342 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3343 if (Shuffle && Shuffle != SV)
3344 return false;
3345 Shuffle = SV;
3346 continue;
3347 }
3348 }
3349
3350 // Anything else is currently an unknown node.
3351 return false;
3352 }
3353
3354 if (!Shuffle)
3355 return false;
3356
3357 // Check that all uses of the binary ops and shuffles are also included in the
3358 // lane-invariant operations (Visited should be the list of lanewise
3359 // instructions, including the shuffle that we found).
3360 for (auto *V : Visited)
3361 for (auto *U : V->users())
3362 if (!Visited.contains(U) && U != &I)
3363 return false;
3364
3365 FixedVectorType *VecType =
3366 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3367 if (!VecType)
3368 return false;
3369 FixedVectorType *ShuffleInputType =
3370 dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
3371 if (!ShuffleInputType)
3372 return false;
3373 unsigned NumInputElts = ShuffleInputType->getNumElements();
3374
3375 // Find the mask from sorting the lanes into order. This is most likely to
3376 // become an identity or concat mask. Undef elements are pushed to the end.
3377 SmallVector<int> ConcatMask;
3378 Shuffle->getShuffleMask(ConcatMask);
3379 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3380 bool UsesSecondVec =
3381 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3382
3383 InstructionCost OldCost = TTI.getShuffleCost(
3384 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3385 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3386 InstructionCost NewCost = TTI.getShuffleCost(
3387 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3388 ShuffleInputType, ConcatMask, CostKind);
3389
3390 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3391 << "\n");
3392 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3393 << "\n");
3394 bool MadeChanges = false;
3395 if (NewCost < OldCost) {
3396 Builder.SetInsertPoint(Shuffle);
3397 Value *NewShuffle = Builder.CreateShuffleVector(
3398 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3399 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3400 replaceValue(*Shuffle, *NewShuffle);
3401 return true;
3402 }
3403
3404 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3405 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3406 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3407 return MadeChanges;
3408}
3409
3410/// For a given chain of patterns of the following form:
3411///
3412/// ```
3413/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3414///
3415/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3416/// ty1> %1)
3417/// OR
3418/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3419///
3420/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3421/// ...
3422/// ...
3423/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3424/// 3), <n x ty1> %(i - 2)
3425/// OR
3426/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3427///
3428/// %(i) = extractelement <n x ty1> %(i - 1), 0
3429/// ```
3430///
3431/// Where:
3432/// `mask` follows a partition pattern:
3433///
3434/// Ex:
3435/// [n = 8, p = poison]
3436///
3437/// 4 5 6 7 | p p p p
3438/// 2 3 | p p p p p p
3439/// 1 | p p p p p p p
3440///
3441/// For powers of 2, there's a consistent pattern, but for other cases
3442/// the parity of the current half value at each step decides the
3443/// next partition half (see `ExpectedParityMask` for more logical details
3444/// in generalising this).
3445///
3446/// Ex:
3447/// [n = 6]
3448///
3449/// 3 4 5 | p p p
3450/// 1 2 | p p p p
3451/// 1 | p p p p p
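/// When the cost model is in favour, the whole chain is replaced by a single
/// @llvm.vector.reduce.* intrinsic on the original input vector.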
3452bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3453 // Going bottom-up for the pattern.
3454 std::queue<Value *> InstWorklist;
3455 InstructionCost OrigCost = 0;
3456
3457 // Common instruction operation after each shuffle op.
3458 std::optional<unsigned int> CommonCallOp = std::nullopt;
3459 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3460
3461 bool IsFirstCallOrBinInst = true;
3462 bool ShouldBeCallOrBinInst = true;
3463
3464 // This stores the last used instructions for shuffle/common op.
3465 //
3466 // PrevVecV[0] / PrevVecV[1] store the two operands of the most recently
3467 // matched shuffle/common op.
3468 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3469
3470 Value *VecOpEE;
3471 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3472 return false;
3473
3474 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3475 if (!FVT)
3476 return false;
3477
3478 int64_t VecSize = FVT->getNumElements();
3479 if (VecSize < 2)
3480 return false;
3481
3482 // Number of levels would be ~log2(n), considering we always partition
3483 // by half for this fold pattern.
3484 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3485 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3486
3487 // This is how we generalise for all element sizes.
3488 // At each step, if vector size is odd, we need non-poison
3489 // values to cover the dominant half so we don't miss out on any element.
3490 //
3491 // This mask will help us retrieve this as we go from bottom to top:
3492 //
3493 // Mask Set -> N = N * 2 - 1
3494 // Mask Unset -> N = N * 2
3495 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3496 Cur = (Cur + 1) / 2, --Mask) {
3497 if (Cur & 1)
3498 ExpectedParityMask |= (1ll << Mask);
3499 }
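    // Ex (illustrative): VecSize = 6 gives NumLevels = 3 and the halving
    // sequence 6 -> 3 -> 2; only the step at value 3 is odd, so
    // ExpectedParityMask ends up as 0b010.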
3500
3501 InstWorklist.push(VecOpEE);
3502
3503 while (!InstWorklist.empty()) {
3504 Value *CI = InstWorklist.front();
3505 InstWorklist.pop();
3506
3507 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3508 if (!ShouldBeCallOrBinInst)
3509 return false;
3510
3511 if (!IsFirstCallOrBinInst &&
3512 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3513 return false;
3514
3515 // For the first found call/bin op, the vector has to come from the
3516 // extract element op.
3517 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3518 return false;
3519 IsFirstCallOrBinInst = false;
3520
3521 if (!CommonCallOp)
3522 CommonCallOp = II->getIntrinsicID();
3523 if (II->getIntrinsicID() != *CommonCallOp)
3524 return false;
3525
3526 switch (II->getIntrinsicID()) {
3527 case Intrinsic::umin:
3528 case Intrinsic::umax:
3529 case Intrinsic::smin:
3530 case Intrinsic::smax: {
3531 auto *Op0 = II->getOperand(0);
3532 auto *Op1 = II->getOperand(1);
3533 PrevVecV[0] = Op0;
3534 PrevVecV[1] = Op1;
3535 break;
3536 }
3537 default:
3538 return false;
3539 }
3540 ShouldBeCallOrBinInst ^= 1;
3541
3542 IntrinsicCostAttributes ICA(
3543 *CommonCallOp, II->getType(),
3544 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
3545 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
3546
3547 // The operands may appear as (a, b) or (b, a); swap so that the shuffle
3548 // operand, if any, ends up in PrevVecV[1] as we walk up the chain.
3549 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3550 std::swap(PrevVecV[0], PrevVecV[1]);
3551 InstWorklist.push(PrevVecV[1]);
3552 InstWorklist.push(PrevVecV[0]);
3553 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
3554 // Similar logic for bin ops.
3555
3556 if (!ShouldBeCallOrBinInst)
3557 return false;
3558
3559 if (!IsFirstCallOrBinInst &&
3560 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3561 return false;
3562
3563 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3564 return false;
3565 IsFirstCallOrBinInst = false;
3566
3567 if (!CommonBinOp)
3568 CommonBinOp = BinOp->getOpcode();
3569
3570 if (BinOp->getOpcode() != *CommonBinOp)
3571 return false;
3572
3573 switch (*CommonBinOp) {
3574 case BinaryOperator::Add:
3575 case BinaryOperator::Mul:
3576 case BinaryOperator::Or:
3577 case BinaryOperator::And:
3578 case BinaryOperator::Xor: {
3579 auto *Op0 = BinOp->getOperand(0);
3580 auto *Op1 = BinOp->getOperand(1);
3581 PrevVecV[0] = Op0;
3582 PrevVecV[1] = Op1;
3583 break;
3584 }
3585 default:
3586 return false;
3587 }
3588 ShouldBeCallOrBinInst ^= 1;
3589
3590 OrigCost +=
3591 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
3592
3593 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3594 std::swap(PrevVecV[0], PrevVecV[1]);
3595 InstWorklist.push(PrevVecV[1]);
3596 InstWorklist.push(PrevVecV[0]);
3597 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3598 // We shouldn't have any null values in the previous vectors;
3599 // if so, there was a mismatch in the pattern.
3600 if (ShouldBeCallOrBinInst ||
3601 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3602 return false;
3603
3604 if (SVInst != PrevVecV[1])
3605 return false;
3606
3607 ArrayRef<int> CurMask;
3608 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
3609 m_Mask(CurMask))))
3610 return false;
3611
3612 // Subtract the parity mask when checking the condition.
3613 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
3614 if (Mask < ShuffleMaskHalf &&
3615 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
3616 return false;
3617 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
3618 return false;
3619 }
3620
3621 // Update mask values.
3622 ShuffleMaskHalf *= 2;
3623 ShuffleMaskHalf -= (ExpectedParityMask & 1);
3624 ExpectedParityMask >>= 1;
3625
3626 OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3627 SVInst->getType(), SVInst->getType(),
3628 CurMask, CostKind);
3629
3630 VisitedCnt += 1;
3631 if (!ExpectedParityMask && VisitedCnt == NumLevels)
3632 break;
3633
3634 ShouldBeCallOrBinInst ^= 1;
3635 } else {
3636 return false;
3637 }
3638 }
3639
3640 // Pattern should end with a shuffle op.
3641 if (ShouldBeCallOrBinInst)
3642 return false;
3643
3644 assert(VecSize != -1 && "Expected Match for Vector Size");
3645
3646 Value *FinalVecV = PrevVecV[0];
3647 if (!FinalVecV)
3648 return false;
3649
3650 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
3651
3652 Intrinsic::ID ReducedOp =
3653 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
3654 : getReductionForBinop(*CommonBinOp));
3655 if (!ReducedOp)
3656 return false;
3657
3658 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
3659 InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
3660
3661 if (NewCost >= OrigCost)
3662 return false;
3663
3664 auto *ReducedResult =
3665 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
3666 replaceValue(I, *ReducedResult);
3667
3668 return true;
3669}
3670
3671/// Determine if it's more efficient to fold:
3672/// reduce(trunc(x)) -> trunc(reduce(x)).
3673/// reduce(sext(x)) -> sext(reduce(x)).
3674/// reduce(zext(x)) -> zext(reduce(x)).
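/// Ex (illustrative):
///   %t = trunc <8 x i32> %x to <8 x i16>
///   %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %t)
/// can become a reduce over <8 x i32> followed by a scalar trunc to i16,
/// since truncation distributes over wrapping addition.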
3675bool VectorCombine::foldCastFromReductions(Instruction &I) {
3676 auto *II = dyn_cast<IntrinsicInst>(&I);
3677 if (!II)
3678 return false;
3679
3680 bool TruncOnly = false;
3681 Intrinsic::ID IID = II->getIntrinsicID();
3682 switch (IID) {
3683 case Intrinsic::vector_reduce_add:
3684 case Intrinsic::vector_reduce_mul:
3685 TruncOnly = true;
3686 break;
3687 case Intrinsic::vector_reduce_and:
3688 case Intrinsic::vector_reduce_or:
3689 case Intrinsic::vector_reduce_xor:
3690 break;
3691 default:
3692 return false;
3693 }
3694
3695 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
3696 Value *ReductionSrc = I.getOperand(0);
3697
3698 Value *Src;
3699 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
3700 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
3701 return false;
3702
3703 auto CastOpc =
3704 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
3705
3706 auto *SrcTy = cast<VectorType>(Src->getType());
3707 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
3708 Type *ResultTy = I.getType();
3709
3710 InstructionCost OldCost = TTI.getArithmeticReductionCost(
3711 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
3712 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
3713 TTI::CastContextHint::None, CostKind,
3714 cast<CastInst>(ReductionSrc));
3715 InstructionCost NewCost =
3716 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
3717 CostKind) +
3718 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
3719 TTI::CastContextHint::None, CostKind);
3720
3721 if (OldCost <= NewCost || !NewCost.isValid())
3722 return false;
3723
3724 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
3725 II->getIntrinsicID(), {Src});
3726 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
3727 replaceValue(I, *NewCast);
3728 return true;
3729}
3730
3731/// Returns true if this ShuffleVectorInst eventually feeds into a
3732/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
3733/// chains of shuffles and binary operators (in any combination/order).
3734/// The search gives up once MaxVisited instructions have been checked.
3735static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
3736 constexpr unsigned MaxVisited = 32;
3737 SmallPtrSet<Instruction *, 8> Visited;
3738 SmallVector<Instruction *, 4> WorkList;
3739 bool FoundReduction = false;
3740
3741 WorkList.push_back(SVI);
3742 while (!WorkList.empty()) {
3743 Instruction *I = WorkList.pop_back_val();
3744 for (User *U : I->users()) {
3745 auto *UI = cast<Instruction>(U);
3746 if (!UI || !Visited.insert(UI).second)
3747 continue;
3748 if (Visited.size() > MaxVisited)
3749 return false;
3750 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
3751 // More than one reduction reached
3752 if (FoundReduction)
3753 return false;
3754 switch (II->getIntrinsicID()) {
3755 case Intrinsic::vector_reduce_add:
3756 case Intrinsic::vector_reduce_mul:
3757 case Intrinsic::vector_reduce_and:
3758 case Intrinsic::vector_reduce_or:
3759 case Intrinsic::vector_reduce_xor:
3760 case Intrinsic::vector_reduce_smin:
3761 case Intrinsic::vector_reduce_smax:
3762 case Intrinsic::vector_reduce_umin:
3763 case Intrinsic::vector_reduce_umax:
3764 FoundReduction = true;
3765 continue;
3766 default:
3767 return false;
3768 }
3769 }
3770
3771 if (!isa<BinaryOperator>(UI) && !isa<ShuffleVectorInst>(UI))
3772 return false;
3773
3774 WorkList.emplace_back(UI);
3775 }
3776 }
3777 return FoundReduction;
3778}
3779
3780/// This method looks for groups of shuffles acting on binops, of the form:
3781/// %x = shuffle ...
3782/// %y = shuffle ...
3783/// %a = binop %x, %y
3784/// %b = binop %x, %y
3785/// shuffle %a, %b, selectmask
3786/// We may, especially if the shuffle is wider than legal, be able to convert
3787/// the shuffle to a form where only parts of a and b need to be computed. On
3788/// architectures with no obvious "select" shuffle, this can reduce the total
3789/// number of operations if the target reports them as cheaper.
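/// Ex (illustrative): with <4 x i32> operands, selectmask <0, 5, 2, 7> takes
/// the even lanes from %a and the odd lanes from %b, so only half of each
/// binop result is ever used.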
3790bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
3791 auto *SVI = cast<ShuffleVectorInst>(&I);
3792 auto *VT = cast<FixedVectorType>(I.getType());
3793 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
3794 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
3795 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
3796 VT != Op0->getType())
3797 return false;
3798
3799 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
3800 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
3801 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
3802 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
3803 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
3804 auto checkSVNonOpUses = [&](Instruction *I) {
3805 if (!I || I->getOperand(0)->getType() != VT)
3806 return true;
3807 return any_of(I->users(), [&](User *U) {
3808 return U != Op0 && U != Op1 &&
3809 !(isa<ShuffleVectorInst>(U) &&
3810 (InputShuffles.contains(cast<Instruction>(U)) ||
3811 isInstructionTriviallyDead(cast<Instruction>(U))));
3812 });
3813 };
3814 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
3815 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
3816 return false;
3817
3818 // Collect all the uses that are shuffles that we can transform together. We
3819 // may not have a single shuffle, but a group that can all be transformed
3820 // together profitably.
3821 SmallVector<ShuffleVectorInst *> Shuffles;
3822 auto collectShuffles = [&](Instruction *I) {
3823 for (auto *U : I->users()) {
3824 auto *SV = dyn_cast<ShuffleVectorInst>(U);
3825 if (!SV || SV->getType() != VT)
3826 return false;
3827 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
3828 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
3829 return false;
3830 if (!llvm::is_contained(Shuffles, SV))
3831 Shuffles.push_back(SV);
3832 }
3833 return true;
3834 };
3835 if (!collectShuffles(Op0) || !collectShuffles(Op1))
3836 return false;
3837 // From a reduction, we need to be processing a single shuffle, otherwise the
3838 // other uses will not be lane-invariant.
3839 if (FromReduction && Shuffles.size() > 1)
3840 return false;
3841
3842 // Add any shuffle uses for the shuffles we have found, to include them in our
3843 // cost calculations.
3844 if (!FromReduction) {
3845 for (ShuffleVectorInst *SV : Shuffles) {
3846 for (auto *U : SV->users()) {
3847 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
3848 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
3849 Shuffles.push_back(SSV);
3850 }
3851 }
3852 }
3853
3854 // For each of the output shuffles, we try to sort all the first vector
3855 // elements to the beginning, followed by the second vector's elements at the
3856 // end. If the binops are legalized to smaller vectors, this may reduce total
3857 // number of binops. We compute the ReconstructMask mask needed to convert
3858 // back to the original lane order.
3859 SmallVector<std::pair<int, int>> V1, V2;
3860 SmallVector<SmallVector<int>> OrigReconstructMasks;
3861 int MaxV1Elt = 0, MaxV2Elt = 0;
3862 unsigned NumElts = VT->getNumElements();
3863 for (ShuffleVectorInst *SVN : Shuffles) {
3864 SmallVector<int> Mask;
3865 SVN->getShuffleMask(Mask);
3866
3867 // Check the operands are the same as the original, or reversed (in which
3868 // case we need to commute the mask).
3869 Value *SVOp0 = SVN->getOperand(0);
3870 Value *SVOp1 = SVN->getOperand(1);
3871 if (isa<UndefValue>(SVOp1)) {
3872 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
3873 SVOp0 = SSV->getOperand(0);
3874 SVOp1 = SSV->getOperand(1);
3875 for (int &Elem : Mask) {
3876 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
3877 return false;
3878 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
3879 }
3880 }
3881 if (SVOp0 == Op1 && SVOp1 == Op0) {
3882 std::swap(SVOp0, SVOp1);
3883 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
3884 }
3885 if (SVOp0 != Op0 || SVOp1 != Op1)
3886 return false;
3887
3888 // Calculate the reconstruction mask for this shuffle, as the mask needed to
3889 // take the packed values from Op0/Op1 and reconstruct the original
3890 // order.
3891 SmallVector<int> ReconstructMask;
3892 for (unsigned I = 0; I < Mask.size(); I++) {
3893 if (Mask[I] < 0) {
3894 ReconstructMask.push_back(-1);
3895 } else if (Mask[I] < static_cast<int>(NumElts)) {
3896 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
3897 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
3898 return Mask[I] == A.first;
3899 });
3900 if (It != V1.end())
3901 ReconstructMask.push_back(It - V1.begin());
3902 else {
3903 ReconstructMask.push_back(V1.size());
3904 V1.emplace_back(Mask[I], V1.size());
3905 }
3906 } else {
3907 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
3908 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
3909 return Mask[I] - static_cast<int>(NumElts) == A.first;
3910 });
3911 if (It != V2.end())
3912 ReconstructMask.push_back(NumElts + It - V2.begin());
3913 else {
3914 ReconstructMask.push_back(NumElts + V2.size());
3915 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
3916 }
3917 }
3918 }
3919
3920 // For reductions, we know that the lane ordering out doesn't alter the
3921 // result. In-order can help simplify the shuffle away.
3922 if (FromReduction)
3923 sort(ReconstructMask);
3924 OrigReconstructMasks.push_back(std::move(ReconstructMask));
3925 }
3926
3927 // If the maximum elements used from V1 and V2 are not larger than the new
3928 // vectors, the vectors are already packed and performing the optimization
3929 // again will likely not help any further. This also prevents us from getting
3930 // stuck in a cycle in case the costs do not also rule it out.
3931 if (V1.empty() || V2.empty() ||
3932 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
3933 MaxV2Elt == static_cast<int>(V2.size()) - 1))
3934 return false;
3935
3936 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
3937 // shuffle of another shuffle, or not a shuffle (which is treated like an
3938 // identity shuffle).
3939 auto GetBaseMaskValue = [&](Instruction *I, int M) {
3940 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3941 if (!SV)
3942 return M;
3943 if (isa<UndefValue>(SV->getOperand(1)))
3944 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
3945 if (InputShuffles.contains(SSV))
3946 return SSV->getMaskValue(SV->getMaskValue(M));
3947 return SV->getMaskValue(M);
3948 };
3949
3950 // Attempt to sort the inputs by ascending mask values to make simpler input
3951 // shuffles and push complex shuffles down to the uses. We sort on the first
3952 // of the two input shuffle orders, to try and get at least one input into a
3953 // nice order.
3954 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
3955 std::pair<int, int> Y) {
3956 int MXA = GetBaseMaskValue(A, X.first);
3957 int MYA = GetBaseMaskValue(A, Y.first);
3958 return MXA < MYA;
3959 };
3960 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
3961 return SortBase(SVI0A, A, B);
3962 });
3963 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
3964 return SortBase(SVI1A, A, B);
3965 });
3966 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
3967 // modified order of the input shuffles.
3968 SmallVector<SmallVector<int>> ReconstructMasks;
3969 for (const auto &Mask : OrigReconstructMasks) {
3970 SmallVector<int> ReconstructMask;
3971 for (int M : Mask) {
3972 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
3973 auto It = find_if(V, [M](auto A) { return A.second == M; });
3974 assert(It != V.end() && "Expected all entries in Mask");
3975 return std::distance(V.begin(), It);
3976 };
3977 if (M < 0)
3978 ReconstructMask.push_back(-1);
3979 else if (M < static_cast<int>(NumElts)) {
3980 ReconstructMask.push_back(FindIndex(V1, M));
3981 } else {
3982 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
3983 }
3984 }
3985 ReconstructMasks.push_back(std::move(ReconstructMask));
3986 }
3987
3988 // Calculate the masks needed for the new input shuffles, which get padded
3989 // with poison elements.
3990 SmallVector<int> V1A, V1B, V2A, V2B;
3991 for (unsigned I = 0; I < V1.size(); I++) {
3992 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
3993 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
3994 }
3995 for (unsigned I = 0; I < V2.size(); I++) {
3996 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
3997 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
3998 }
3999 while (V1A.size() < NumElts) {
4000 V1A.push_back(PoisonMaskElem);
4001 V1B.push_back(PoisonMaskElem);
4002 }
4003 while (V2A.size() < NumElts) {
4004 V2A.push_back(PoisonMaskElem);
4005 V2B.push_back(PoisonMaskElem);
4006 }
4007
4008 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
4009 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4010 if (!SV)
4011 return C;
4012 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
4013 ? TTI::SK_PermuteSingleSrc
4014 : TTI::SK_PermuteTwoSrc,
4015 VT, VT, SV->getShuffleMask(), CostKind);
4016 };
4017 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
4018 return C +
4019 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
4020 };
4021
4022 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
4023 unsigned MaxVectorSize =
4024 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
4025 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
4026 if (MaxElementsInVector == 0)
4027 return false;
4028 // When there are multiple shufflevector operations on the same input,
4029 // especially when the vector length is larger than the register size,
4030 // identical shuffle patterns may occur across different groups of elements.
4031 // To avoid overestimating the cost by counting these repeated shuffles more
4032 // than once, we only account for unique shuffle patterns. This adjustment
4033 // prevents inflated costs in the cost model for wide vectors split into
4034 // several register-sized groups.
4035 std::set<SmallVector<int, 4>> UniqueShuffles;
4036 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
4037 // Compute the cost for performing the shuffle over the full vector.
4038 auto ShuffleCost =
4039 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
4040 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
4041 if (NumFullVectors < 2)
4042 return C + ShuffleCost;
4043 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
4044 unsigned NumUniqueGroups = 0;
4045 unsigned NumGroups = Mask.size() / MaxElementsInVector;
4046 // For each group of MaxElementsInVector contiguous elements,
4047 // collect their shuffle pattern and insert into the set of unique patterns.
4048 for (unsigned I = 0; I < NumFullVectors; ++I) {
4049 for (unsigned J = 0; J < MaxElementsInVector; ++J)
4050 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
4051 if (UniqueShuffles.insert(SubShuffle).second)
4052 NumUniqueGroups += 1;
4053 }
4054 return C + ShuffleCost * NumUniqueGroups / NumGroups;
4055 };
4056 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
4057 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4058 if (!SV)
4059 return C;
4060 SmallVector<int, 16> Mask;
4061 SV->getShuffleMask(Mask);
4062 return AddShuffleMaskAdjustedCost(C, Mask);
4063 };
4064 // Check that the input consists of ShuffleVectors applied to the same inputs
4065 auto AllShufflesHaveSameOperands =
4066 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
4067 if (InputShuffles.size() < 2)
4068 return false;
4069 ShuffleVectorInst *FirstSV =
4070 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
4071 if (!FirstSV)
4072 return false;
4073
4074 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
4075 return std::all_of(
4076 std::next(InputShuffles.begin()), InputShuffles.end(),
4077 [&](Instruction *I) {
4078 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
4079 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
4080 });
4081 };
4082
4083 // Get the costs of the shuffles + binops before and after with the new
4084 // shuffle masks.
4085 InstructionCost CostBefore =
4086 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
4087 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
4088 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
4089 InstructionCost(0), AddShuffleCost);
4090 if (AllShufflesHaveSameOperands(InputShuffles)) {
4091 UniqueShuffles.clear();
4092 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4093 InstructionCost(0), AddShuffleAdjustedCost);
4094 } else {
4095 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4096 InstructionCost(0), AddShuffleCost);
4097 }
4098
4099 // The new binops will be unused for lanes past the used shuffle lengths.
4100 // These types attempt to get the correct cost for that from the target.
4101 FixedVectorType *Op0SmallVT =
4102 FixedVectorType::get(VT->getScalarType(), V1.size());
4103 FixedVectorType *Op1SmallVT =
4104 FixedVectorType::get(VT->getScalarType(), V2.size());
4105 InstructionCost CostAfter =
4106 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
4107 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
4108 UniqueShuffles.clear();
4109 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
4110 InstructionCost(0), AddShuffleMaskAdjustedCost);
4111 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
4112 CostAfter +=
4113 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
4114 InstructionCost(0), AddShuffleMaskCost);
4115
4116 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
4117 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
4118 << " vs CostAfter: " << CostAfter << "\n");
4119 if (CostBefore < CostAfter ||
4120 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
4121 return false;
4122
4123 // The cost model has passed, create the new instructions.
4124 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
4125 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4126 if (!SV)
4127 return I;
4128 if (isa<UndefValue>(SV->getOperand(1)))
4129 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4130 if (InputShuffles.contains(SSV))
4131 return SSV->getOperand(Op);
4132 return SV->getOperand(Op);
4133 };
4134 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
4135 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
4136 GetShuffleOperand(SVI0A, 1), V1A);
4137 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
4138 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
4139 GetShuffleOperand(SVI0B, 1), V1B);
4140 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
4141 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
4142 GetShuffleOperand(SVI1A, 1), V2A);
4143 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
4144 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
4145 GetShuffleOperand(SVI1B, 1), V2B);
4146 Builder.SetInsertPoint(Op0);
4147 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
4148 NSV0A, NSV0B);
4149 if (auto *I = dyn_cast<Instruction>(NOp0))
4150 I->copyIRFlags(Op0, true);
4151 Builder.SetInsertPoint(Op1);
4152 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
4153 NSV1A, NSV1B);
4154 if (auto *I = dyn_cast<Instruction>(NOp1))
4155 I->copyIRFlags(Op1, true);
4156
4157 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
4158 Builder.SetInsertPoint(Shuffles[S]);
4159 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
4160 replaceValue(*Shuffles[S], *NSV, false);
4161 }
4162
4163 Worklist.pushValue(NSV0A);
4164 Worklist.pushValue(NSV0B);
4165 Worklist.pushValue(NSV1A);
4166 Worklist.pushValue(NSV1B);
4167 return true;
4168}
4169
4170/// Check if instruction depends on ZExt and this ZExt can be moved after the
4171/// instruction. Move ZExt if it is profitable. For example:
4172/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
4173/// lshr((zext(x),y) -> zext(lshr(x,trunc(y)))
4174/// Cost model calculations take into account whether zext(x) has other users and
4175/// whether it can be propagated through them too.
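/// Ex (illustrative):
///   %z = zext <8 x i8> %x to <8 x i32>
///   %r = and <8 x i32> %z, splat (i32 15)
/// can be rewritten as an i8 'and' of %x with splat (i8 15) followed by a
/// single zext of the narrow result.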
4176bool VectorCombine::shrinkType(Instruction &I) {
4177 Value *ZExted, *OtherOperand;
4178 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
4179 m_Value(OtherOperand))) &&
4180 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
4181 return false;
4182
4183 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
4184
4185 auto *BigTy = cast<FixedVectorType>(I.getType());
4186 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
4187 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
4188
4189 if (I.getOpcode() == Instruction::LShr) {
4190 // Check that the shift amount is less than the number of bits in the
4191 // smaller type. Otherwise, the smaller lshr will return a poison value.
4192 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
4193 if (ShAmtKB.getMaxValue().uge(BW))
4194 return false;
4195 } else {
4196 // Check that the expression overall uses at most the same number of bits as
4197 // ZExted
4198 KnownBits KB = computeKnownBits(&I, *DL);
4199 if (KB.countMaxActiveBits() > BW)
4200 return false;
4201 }
4202
4203 // Calculate costs of leaving current IR as it is and moving ZExt operation
4204 // later, along with adding truncates if needed
4205 InstructionCost ZExtCost = TTI.getCastInstrCost(
4206 Instruction::ZExt, BigTy, SmallTy,
4207 TargetTransformInfo::CastContextHint::None, CostKind);
4208 InstructionCost CurrentCost = ZExtCost;
4209 InstructionCost ShrinkCost = 0;
4210
4211 // Calculate total cost and check that we can propagate through all ZExt users
4212 for (User *U : ZExtOperand->users()) {
4213 auto *UI = cast<Instruction>(U);
4214 if (UI == &I) {
4215 CurrentCost +=
4216 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4217 ShrinkCost +=
4218 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4219 ShrinkCost += ZExtCost;
4220 continue;
4221 }
4222
4223 if (!Instruction::isBinaryOp(UI->getOpcode()))
4224 return false;
4225
4226 // Check if we can propagate ZExt through its other users
4227 KnownBits KB = computeKnownBits(UI, *DL);
4228 if (KB.countMaxActiveBits() > BW)
4229 return false;
4230
4231 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4232 ShrinkCost +=
4233 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4234 ShrinkCost += ZExtCost;
4235 }
4236
4237 // If the other instruction operand is not a constant, we'll need to
4238 // generate a truncate instruction, so we have to adjust the cost.
4239 if (!isa<Constant>(OtherOperand))
4240 ShrinkCost += TTI.getCastInstrCost(
4241 Instruction::Trunc, SmallTy, BigTy,
4242 TargetTransformInfo::CastContextHint::None, CostKind);
4243
4244 // If the cost of shrinking types and leaving the IR is the same, we'll lean
4245 // towards modifying the IR because shrinking opens opportunities for other
4246 // shrinking optimisations.
4247 if (ShrinkCost > CurrentCost)
4248 return false;
4249
4250 Builder.SetInsertPoint(&I);
4251 Value *Op0 = ZExted;
4252 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
4253 // Keep the order of operands the same
4254 if (I.getOperand(0) == OtherOperand)
4255 std::swap(Op0, Op1);
4256 Value *NewBinOp =
4257 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
4258 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
4259 cast<Instruction>(NewBinOp)->copyMetadata(I);
4260 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
4261 replaceValue(I, *NewZExtr);
4262 return true;
4263}
4264
4265/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
4266/// shuffle (DstVec, SrcVec, Mask)
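/// Ex (illustrative): inserting (extractelement <4 x i32> %src, 1) into lane
/// 0 of <4 x i32> %dst becomes
///   shufflevector <4 x i32> %dst, <4 x i32> %src, <4 x i32> <i32 5, i32 1, i32 2, i32 3>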
4267bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
4268 Value *DstVec, *SrcVec;
4269 uint64_t ExtIdx, InsIdx;
4270 if (!match(&I,
4271 m_InsertElt(m_Value(DstVec),
4272 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
4273 m_ConstantInt(InsIdx))))
4274 return false;
4275
4276 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
4277 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
4278 // We can combine vectors with different element counts, but the element types must match.
4279 if (!DstVecTy || !SrcVecTy ||
4280 SrcVecTy->getElementType() != DstVecTy->getElementType())
4281 return false;
4282
4283 unsigned NumDstElts = DstVecTy->getNumElements();
4284 unsigned NumSrcElts = SrcVecTy->getNumElements();
4285 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
4286 return false;
4287
4288 // Insertion into poison is a cheaper single operand shuffle.
4289 TargetTransformInfo::ShuffleKind SK = TargetTransformInfo::SK_PermuteTwoSrc;
4290 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
4291
4292 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4293 bool IsExtIdxInBounds = ExtIdx < NumDstElts;
4294 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
4295 if (NeedDstSrcSwap) {
4296 SK = TargetTransformInfo::SK_PermuteSingleSrc;
4297 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4298 Mask[InsIdx] = 0;
4299 else
4300 Mask[InsIdx] = ExtIdx;
4301 std::swap(DstVec, SrcVec);
4302 } else {
4304 std::iota(Mask.begin(), Mask.end(), 0);
4305 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4306 Mask[InsIdx] = NumDstElts;
4307 else
4308 Mask[InsIdx] = ExtIdx + NumDstElts;
4309 }
4310
4311 // Cost
4312 auto *Ins = cast<InsertElementInst>(&I);
4313 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
4314 InstructionCost InsCost =
4315 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
4316 InstructionCost ExtCost =
4317 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
4318 InstructionCost OldCost = ExtCost + InsCost;
4319
4320 InstructionCost NewCost = 0;
4321 SmallVector<int> ExtToVecMask;
4322 if (!NeedExpOrNarrow) {
4323 // Ignore 'free' identity insertion shuffle.
4324 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
4325 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
4326 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
4327 nullptr, {DstVec, SrcVec});
4328 } else {
4329 // When creating a length-changing vector, always create it with a mask whose
4330 // first element has an ExtIdx, so that the first element of the vector
4331 // being created is always the target to be extracted.
4332 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4333 if (IsExtIdxInBounds)
4334 ExtToVecMask[ExtIdx] = ExtIdx;
4335 else
4336 ExtToVecMask[0] = ExtIdx;
4337 // Add cost for expanding or narrowing
4338 NewCost = TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
4339 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
4340 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
4341 }
4342
4343 if (!Ext->hasOneUse())
4344 NewCost += ExtCost;
4345
4346 LLVM_DEBUG(dbgs() << "Found an insert/extract shuffle-like pair: " << I
4347 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4348 << "\n");
4349
4350 if (OldCost < NewCost)
4351 return false;
4352
4353 if (NeedExpOrNarrow) {
4354 if (!NeedDstSrcSwap)
4355 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
4356 else
4357 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
4358 }
4359
4360 // Canonicalize undef param to RHS to help further folds.
4361 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
4362 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
4363 std::swap(DstVec, SrcVec);
4364 }
4365
4366 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
4367 replaceValue(I, *Shuf);
4368
4369 return true;
4370}
4371
4372/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
4373/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
4374/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
4375/// before casting it back into `<vscale x 16 x i32>`.
4376bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
4377 const APInt *SplatVal0, *SplatVal1;
4378 if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
4379 m_APInt(SplatVal0), m_APInt(SplatVal1))))
4380 return false;
4381
4382 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
4383 << "\n");
4384
4385 auto *VTy =
4386 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
4387 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
4388 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
4389
4390 // Just in case the costs of the interleave2 intrinsic and the bitcast are
4391 // both invalid, in which case we want to bail out, we use <= rather
4392 // than < here. Even if they both have valid and equal costs, it's probably
4393 // not a good idea to emit a high-cost constant splat.
4394 if (TTI.getInstructionCost(&I, CostKind) <=
4395 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
4396 TTI::CastContextHint::None, CostKind)) {
4397 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
4398 << *I.getType() << " is too high.\n");
4399 return false;
4400 }
4401
4402 APInt NewSplatVal = SplatVal1->zext(Width * 2);
4403 NewSplatVal <<= Width;
4404 NewSplatVal |= SplatVal0->zext(Width * 2);
4405 auto *NewSplat = ConstantVector::getSplat(
4406 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
4407
4408 IRBuilder<> Builder(&I);
4409 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
4410 return true;
4411}
4412
4413// Attempt to shrink loads that are only used by shufflevector instructions.
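// Ex (illustrative): if a load <8 x i32> is only used by shuffles whose masks
// read lanes 0-3, it can be replaced by a load <4 x i32>, cost model
// permitting.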
4414bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
4415 auto *OldLoad = dyn_cast<LoadInst>(&I);
4416 if (!OldLoad || !OldLoad->isSimple())
4417 return false;
4418
4419 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
4420 if (!OldLoadTy)
4421 return false;
4422
4423 unsigned const OldNumElements = OldLoadTy->getNumElements();
4424
4425 // Search all uses of load. If all uses are shufflevector instructions, and
4426 // the second operands are all poison values, find the minimum and maximum
4427 // indices of the vector elements referenced by all shuffle masks.
4428 // Otherwise return `std::nullopt`.
4429 using IndexRange = std::pair<int, int>;
4430 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
4431 IndexRange OutputRange = IndexRange(OldNumElements, -1);
4432 for (llvm::Use &Use : I.uses()) {
4433 // Ensure all uses match the required pattern.
4434 User *Shuffle = Use.getUser();
4435 ArrayRef<int> Mask;
4436
4437 if (!match(Shuffle,
4438 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
4439 return std::nullopt;
4440
4441 // Ignore shufflevector instructions that have no uses.
4442 if (Shuffle->use_empty())
4443 continue;
4444
4445 // Find the min and max indices used by the shufflevector instruction.
4446 for (int Index : Mask) {
4447 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
4448 OutputRange.first = std::min(Index, OutputRange.first);
4449 OutputRange.second = std::max(Index, OutputRange.second);
4450 }
4451 }
4452 }
4453
4454 if (OutputRange.second < OutputRange.first)
4455 return std::nullopt;
4456
4457 return OutputRange;
4458 };
4459
4460 // Get the range of vector elements used by shufflevector instructions.
4461 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
4462 unsigned const NewNumElements = Indices->second + 1u;
4463
4464 // If the range of vector elements is smaller than the full load, attempt
4465 // to create a smaller load.
4466 if (NewNumElements < OldNumElements) {
4467 IRBuilder Builder(&I);
4468 Builder.SetCurrentDebugLocation(I.getDebugLoc());
4469
4470 // Calculate costs of old and new ops.
4471 Type *ElemTy = OldLoadTy->getElementType();
4472 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
4473 Value *PtrOp = OldLoad->getPointerOperand();
4474
4475 InstructionCost OldCost = TTI.getMemoryOpCost(
4476 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
4477 OldLoad->getPointerAddressSpace(), CostKind);
4478 InstructionCost NewCost =
4479 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
4480 OldLoad->getPointerAddressSpace(), CostKind);
4481
4482 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
4483 SmallVector<UseEntry, 4u> NewUses;
4484 unsigned const MaxIndex = NewNumElements * 2u;
4485
4486 for (llvm::Use &Use : I.uses()) {
4487 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
4488 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
4489
4490 // Create entry for new use.
4491 NewUses.push_back({Shuffle, OldMask});
4492
4493 // Validate mask indices.
4494 for (int Index : OldMask) {
4495 if (Index >= static_cast<int>(MaxIndex))
4496 return false;
4497 }
4498
4499 // Update costs.
4500 OldCost +=
4501 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4502 OldLoadTy, OldMask, CostKind);
4503 NewCost +=
4504 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4505 NewLoadTy, OldMask, CostKind);
4506 }
4507
4508 LLVM_DEBUG(
4509 dbgs() << "Found a load used only by shufflevector instructions: "
4510 << I << "\n OldCost: " << OldCost
4511 << " vs NewCost: " << NewCost << "\n");
4512
4513 if (OldCost < NewCost || !NewCost.isValid())
4514 return false;
4515
4516 // Create new load of smaller vector.
4517 auto *NewLoad = cast<LoadInst>(
4518 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
4519 NewLoad->copyMetadata(I);
4520
4521 // Replace all uses.
4522 for (UseEntry &Use : NewUses) {
4523 ShuffleVectorInst *Shuffle = Use.first;
4524 std::vector<int> &NewMask = Use.second;
4525
4526 Builder.SetInsertPoint(Shuffle);
4527 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
4528 Value *NewShuffle = Builder.CreateShuffleVector(
4529 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
4530
4531 replaceValue(*Shuffle, *NewShuffle, false);
4532 }
4533
4534 return true;
4535 }
4536 }
4537 return false;
4538}
4539
4540// Attempt to narrow a phi of shufflevector instructions where the two incoming
4541// values have the same operands but different masks. If the two shuffle masks
4542// are offsets of one another we can use one branch to rotate the incoming
4543// vector and perform one larger shuffle after the phi.
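// Ex (illustrative): with a <4 x i32> input and <8 x i32> results, incoming
// masks <1,2,3,0,1,2,3,0> and <0,1,2,3,0,1,2,3> differ by a constant offset
// of 1, so one incoming branch rotates the input by a single lane and one
// wide shuffle of the new phi reproduces both results.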
4544bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
4545 auto *Phi = dyn_cast<PHINode>(&I);
4546 if (!Phi || Phi->getNumIncomingValues() != 2u)
4547 return false;
4548
4549 Value *Op = nullptr;
4550 ArrayRef<int> Mask0;
4551 ArrayRef<int> Mask1;
4552
4553 if (!match(Phi->getOperand(0u),
4554 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
4555 !match(Phi->getOperand(1u),
4556 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
4557 return false;
4558
4559 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
4560
4561 // Ensure result vectors are wider than the argument vector.
4562 auto *InputVT = cast<FixedVectorType>(Op->getType());
4563 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
4564 auto const InputNumElements = InputVT->getNumElements();
4565
4566 if (InputNumElements >= ResultVT->getNumElements())
4567 return false;
4568
4569 // Take the difference of the two shuffle masks at each index. Ignore poison
4570 // values at the same index in both masks.
4571 SmallVector<int, 16> NewMask;
4572 NewMask.reserve(Mask0.size());
4573
4574 for (auto [M0, M1] : zip(Mask0, Mask1)) {
4575 if (M0 >= 0 && M1 >= 0)
4576 NewMask.push_back(M0 - M1);
4577 else if (M0 == -1 && M1 == -1)
4578 continue;
4579 else
4580 return false;
4581 }
4582
4583 // Ensure all elements of the new mask are equal. If the difference between
4584 // the incoming mask elements is the same, the two must be constant offsets
4585 // of one another.
4586 if (NewMask.empty() || !all_equal(NewMask))
4587 return false;
4588
4589 // Create new mask using difference of the two incoming masks.
4590 int MaskOffset = NewMask[0u];
4591 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
4592 NewMask.clear();
4593
4594 for (unsigned I = 0u; I < InputNumElements; ++I) {
4595 NewMask.push_back(Index);
4596 Index = (Index + 1u) % InputNumElements;
4597 }
4598
4599 // Calculate costs for worst cases and compare.
4600 auto const Kind = TTI::SK_PermuteSingleSrc;
4601 auto OldCost =
4602 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
4603 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
4604 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
4605 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
4606
4607 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
4608 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4609 << "\n");
4610
4611 if (NewCost > OldCost)
4612 return false;
4613
4614 // Create new shuffles and narrowed phi.
4615 auto Builder = IRBuilder(Shuf);
4616 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
4617 auto *PoisonVal = PoisonValue::get(InputVT);
4618 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
4619 Worklist.push(cast<Instruction>(NewShuf0));
4620
4621 Builder.SetInsertPoint(Phi);
4622 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
4623 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
4624 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
4625 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
4626
4627 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
4628 PoisonVal = PoisonValue::get(NewPhi->getType());
4629 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
4630
4631 replaceValue(*Phi, *NewShuf1);
4632 return true;
4633}
4634
4635/// This is the entry point for all transforms. Pass manager differences are
4636/// handled in the callers of this function.
4637bool VectorCombine::run() {
4638 if (DisableVectorCombine)
4639 return false;
4640
4641 // Don't attempt vectorization if the target does not support vectors.
4642 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
4643 return false;
4644
4645 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
4646
4647 auto FoldInst = [this](Instruction &I) {
4648 Builder.SetInsertPoint(&I);
4649 bool IsVectorType = isa<VectorType>(I.getType());
4650 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
4651 auto Opcode = I.getOpcode();
4652
4653 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
4654
4655 // These folds should be beneficial regardless of when this pass is run
4656 // in the optimization pipeline.
4657 // The type checking is for run-time efficiency. We can avoid wasting time
4658 // dispatching to folding functions if there's no chance of matching.
4659 if (IsFixedVectorType) {
4660 switch (Opcode) {
4661 case Instruction::InsertElement:
4662 if (vectorizeLoadInsert(I))
4663 return true;
4664 break;
4665 case Instruction::ShuffleVector:
4666 if (widenSubvectorLoad(I))
4667 return true;
4668 break;
4669 default:
4670 break;
4671 }
4672 }
4673
4674 // This transform works with scalable and fixed vectors
4675 // TODO: Identify and allow other scalable transforms
4676 if (IsVectorType) {
4677 if (scalarizeOpOrCmp(I))
4678 return true;
4679 if (scalarizeLoad(I))
4680 return true;
4681 if (scalarizeExtExtract(I))
4682 return true;
4683 if (scalarizeVPIntrinsic(I))
4684 return true;
4685 if (foldInterleaveIntrinsics(I))
4686 return true;
4687 }
4688
4689 if (Opcode == Instruction::Store)
4690 if (foldSingleElementStore(I))
4691 return true;
4692
4693 // If this is an early pipeline invocation of this pass, we are done.
4694 if (TryEarlyFoldsOnly)
4695 return false;
4696
4697 // Otherwise, try folds that improve codegen but may interfere with
4698 // early IR canonicalizations.
4699 // The type checking is for run-time efficiency. We can avoid wasting time
4700 // dispatching to folding functions if there's no chance of matching.
4701 if (IsFixedVectorType) {
4702 switch (Opcode) {
4703 case Instruction::InsertElement:
4704 if (foldInsExtFNeg(I))
4705 return true;
4706 if (foldInsExtBinop(I))
4707 return true;
4708 if (foldInsExtVectorToShuffle(I))
4709 return true;
4710 break;
4711 case Instruction::ShuffleVector:
4712 if (foldPermuteOfBinops(I))
4713 return true;
4714 if (foldShuffleOfBinops(I))
4715 return true;
4716 if (foldShuffleOfSelects(I))
4717 return true;
4718 if (foldShuffleOfCastops(I))
4719 return true;
4720 if (foldShuffleOfShuffles(I))
4721 return true;
4722 if (foldShuffleOfIntrinsics(I))
4723 return true;
4724 if (foldSelectShuffle(I))
4725 return true;
4726 if (foldShuffleToIdentity(I))
4727 return true;
4728 break;
4729 case Instruction::Load:
4730 if (shrinkLoadForShuffles(I))
4731 return true;
4732 break;
4733 case Instruction::BitCast:
4734 if (foldBitcastShuffle(I))
4735 return true;
4736 break;
4737 case Instruction::And:
4738 case Instruction::Or:
4739 case Instruction::Xor:
4740 if (foldBitOpOfCastops(I))
4741 return true;
4742 if (foldBitOpOfCastConstant(I))
4743 return true;
4744 break;
4745 case Instruction::PHI:
4746 if (shrinkPhiOfShuffles(I))
4747 return true;
4748 break;
4749 default:
4750 if (shrinkType(I))
4751 return true;
4752 break;
4753 }
4754 } else {
4755 switch (Opcode) {
4756 case Instruction::Call:
4757 if (foldShuffleFromReductions(I))
4758 return true;
4759 if (foldCastFromReductions(I))
4760 return true;
4761 break;
4762 case Instruction::ExtractElement:
4763 if (foldShuffleChainsToReduce(I))
4764 return true;
4765 break;
4766 case Instruction::ICmp:
4767 case Instruction::FCmp:
4768 if (foldExtractExtract(I))
4769 return true;
4770 break;
4771 case Instruction::Or:
4772 if (foldConcatOfBoolMasks(I))
4773 return true;
4774 [[fallthrough]];
4775 default:
4776 if (Instruction::isBinaryOp(Opcode)) {
4777 if (foldExtractExtract(I))
4778 return true;
4779 if (foldExtractedCmps(I))
4780 return true;
4781 if (foldBinopOfReductions(I))
4782 return true;
4783 }
4784 break;
4785 }
4786 }
4787 return false;
4788 };
4789
4790 bool MadeChange = false;
4791 for (BasicBlock &BB : F) {
4792 // Ignore unreachable basic blocks.
4793 if (!DT.isReachableFromEntry(&BB))
4794 continue;
4795 // We want to erase instructions inside this loop, but
4796 // make_early_inc_range is not applicable here, as the next iterator may
4797 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
4798 // We manually maintain the next instruction and update it when it is about
4799 // to be deleted.
4800 Instruction *I = &BB.front();
4801 while (I) {
4802 NextInst = I->getNextNode();
4803 if (!I->isDebugOrPseudoInst())
4804 MadeChange |= FoldInst(*I);
4805 I = NextInst;
4806 }
4807 }
4808
4809 NextInst = nullptr;
4810
4811 while (!Worklist.isEmpty()) {
4812 Instruction *I = Worklist.removeOne();
4813 if (!I)
4814 continue;
4815
4816 if (isInstructionTriviallyDead(I)) {
4817 eraseInstruction(*I);
4818 continue;
4819 }
4820
4821 MadeChange |= FoldInst(*I);
4822 }
4823
4824 return MadeChange;
4825}
4826
4827PreservedAnalyses VectorCombinePass::run(Function &F,
4828 FunctionAnalysisManager &FAM) {
4829 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
4830 TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
4831 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
4832 AAResults &AA = FAM.getResult<AAManager>(F);
4833 const DataLayout *DL = &F.getDataLayout();
4834 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
4835 TryEarlyFoldsOnly);
4836 if (!Combiner.run())
4837 return PreservedAnalyses::all();
4838 PreservedAnalyses PA;
4839 PA.preserveSet<CFGAnalyses>();
4840 return PA;
4841}
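// Editorial note (illustrative, not part of the source): under the new pass
// manager this pass is registered as "vector-combine", so it can be exercised
// on its own with, for example:
//
//   opt -passes=vector-combine -S input.ll
//
// The default optimization pipelines are expected to invoke it twice: an early
// run with TryEarlyFoldsOnly set and a later run with the full set of folds.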