VectorCombine.cpp
1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
38#include <numeric>
39#include <optional>
40#include <queue>
41#include <set>
42
43#define DEBUG_TYPE "vector-combine"
45
46using namespace llvm;
47using namespace llvm::PatternMatch;
48
49STATISTIC(NumVecLoad, "Number of vector loads formed");
50STATISTIC(NumVecCmp, "Number of vector compares formed");
51STATISTIC(NumVecBO, "Number of vector binops formed");
52STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
53STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
54STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
55STATISTIC(NumScalarCmp, "Number of scalar compares formed");
56STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
57
59 "disable-vector-combine", cl::init(false), cl::Hidden,
60 cl::desc("Disable all vector combine transforms"));
61
63 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
64 cl::desc("Disable binop extract to shuffle transforms"));
65
67 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
68 cl::desc("Max number of instructions to scan for vector combining."));
69
70static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
71
72namespace {
73class VectorCombine {
74public:
75 VectorCombine(Function &F, const TargetTransformInfo &TTI,
76 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
77 const DataLayout *DL, TTI::TargetCostKind CostKind,
78 bool TryEarlyFoldsOnly)
79 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
80 DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
81 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
82
83 bool run();
84
85private:
86 Function &F;
87 IRBuilder<InstSimplifyFolder> Builder;
88 const TargetTransformInfo &TTI;
89 const DominatorTree &DT;
90 AAResults &AA;
91 AssumptionCache &AC;
92 const DataLayout *DL;
93 TTI::TargetCostKind CostKind;
94 const SimplifyQuery SQ;
95
96 /// If true, only perform beneficial early IR transforms. Do not introduce new
97 /// vector operations.
98 bool TryEarlyFoldsOnly;
99
100 InstructionWorklist Worklist;
101
102 /// Next instruction to iterate. It will be updated when it is erased by
103 /// RecursivelyDeleteTriviallyDeadInstructions.
104 Instruction *NextInst;
105
106 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
107 // parameter. That should be updated to specific sub-classes because the
108 // run loop was changed to dispatch on opcode.
109 bool vectorizeLoadInsert(Instruction &I);
110 bool widenSubvectorLoad(Instruction &I);
111 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
112 ExtractElementInst *Ext1,
113 unsigned PreferredExtractIndex) const;
114 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
115 const Instruction &I,
116 ExtractElementInst *&ConvertToShuffle,
117 unsigned PreferredExtractIndex);
118 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
119 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 bool foldExtractExtract(Instruction &I);
121 bool foldInsExtFNeg(Instruction &I);
122 bool foldInsExtBinop(Instruction &I);
123 bool foldInsExtVectorToShuffle(Instruction &I);
124 bool foldBitOpOfCastops(Instruction &I);
125 bool foldBitOpOfCastConstant(Instruction &I);
126 bool foldBitcastShuffle(Instruction &I);
127 bool scalarizeOpOrCmp(Instruction &I);
128 bool scalarizeVPIntrinsic(Instruction &I);
129 bool foldExtractedCmps(Instruction &I);
130 bool foldBinopOfReductions(Instruction &I);
131 bool foldSingleElementStore(Instruction &I);
132 bool scalarizeLoadExtract(Instruction &I);
133 bool scalarizeExtExtract(Instruction &I);
134 bool foldConcatOfBoolMasks(Instruction &I);
135 bool foldPermuteOfBinops(Instruction &I);
136 bool foldShuffleOfBinops(Instruction &I);
137 bool foldShuffleOfSelects(Instruction &I);
138 bool foldShuffleOfCastops(Instruction &I);
139 bool foldShuffleOfShuffles(Instruction &I);
140 bool foldShuffleOfIntrinsics(Instruction &I);
141 bool foldShuffleToIdentity(Instruction &I);
142 bool foldShuffleFromReductions(Instruction &I);
143 bool foldShuffleChainsToReduce(Instruction &I);
144 bool foldCastFromReductions(Instruction &I);
145 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
146 bool foldInterleaveIntrinsics(Instruction &I);
147 bool shrinkType(Instruction &I);
148 bool shrinkLoadForShuffles(Instruction &I);
149 bool shrinkPhiOfShuffles(Instruction &I);
150
151 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
152 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
153 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
154 Old.replaceAllUsesWith(&New);
155 if (auto *NewI = dyn_cast<Instruction>(&New)) {
156 New.takeName(&Old);
157 Worklist.pushUsersToWorkList(*NewI);
158 Worklist.pushValue(NewI);
159 }
160 if (Erase && isInstructionTriviallyDead(&Old)) {
161 eraseInstruction(Old);
162 } else {
163 Worklist.push(&Old);
164 }
165 }
166
167 void eraseInstruction(Instruction &I) {
168 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
169 SmallVector<Value *> Ops(I.operands());
170 Worklist.remove(&I);
171 I.eraseFromParent();
172
173 // Push remaining users of the operands and then the operand itself - allows
174 // further folds that were hindered by OneUse limits.
175 SmallPtrSet<Value *, 4> Visited;
176 for (Value *Op : Ops) {
177 if (!Visited.contains(Op)) {
178 if (auto *OpI = dyn_cast<Instruction>(Op)) {
179 if (RecursivelyDeleteTriviallyDeadInstructions(
180 OpI, nullptr, nullptr, [&](Value *V) {
181 if (auto *I = dyn_cast<Instruction>(V)) {
182 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
183 Worklist.remove(I);
184 if (I == NextInst)
185 NextInst = NextInst->getNextNode();
186 Visited.insert(I);
187 }
188 }))
189 continue;
190 Worklist.pushUsersToWorkList(*OpI);
191 Worklist.pushValue(OpI);
192 }
193 }
194 }
195 }
196};
197} // namespace
198
199/// Return the source operand of a potentially bitcasted value. If there is no
200/// bitcast, return the input value itself.
201static Value *peekThroughBitcasts(Value *V) {
202 while (auto *BitCast = dyn_cast<BitCastInst>(V))
203 V = BitCast->getOperand(0);
204 return V;
205}
206
207static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
208 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
209 // The widened load may load data from dirty regions or create data races
210 // non-existent in the source.
211 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
212 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
213 mustSuppressSpeculation(*Load))
214 return false;
215
216 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
217 // sure we have all of our type-based constraints in place for this target.
218 Type *ScalarTy = Load->getType()->getScalarType();
219 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
220 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
221 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
222 ScalarSize % 8 != 0)
223 return false;
224
225 return true;
226}
227
228bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
229 // Match insert into fixed vector of scalar value.
230 // TODO: Handle non-zero insert index.
231 Value *Scalar;
232 if (!match(&I,
233 m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
234 return false;
235
236 // Optionally match an extract from another vector.
237 Value *X;
238 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
239 if (!HasExtract)
240 X = Scalar;
241
242 auto *Load = dyn_cast<LoadInst>(X);
243 if (!canWidenLoad(Load, TTI))
244 return false;
245
246 Type *ScalarTy = Scalar->getType();
247 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
248 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
249
250 // Check safety of replacing the scalar load with a larger vector load.
251 // We use minimal alignment (maximum flexibility) because we only care about
252 // the dereferenceable region. When calculating cost and creating a new op,
253 // we may use a larger value based on alignment attributes.
254 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
255 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
256
257 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
258 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
259 unsigned OffsetEltIndex = 0;
260 Align Alignment = Load->getAlign();
261 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
262 &DT)) {
263 // It is not safe to load directly from the pointer, but we can still peek
264 // through gep offsets and check if it is safe to load from a base address with
265 // updated alignment. If it is, we can shuffle the element(s) into place
266 // after loading.
267 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
268 APInt Offset(OffsetBitWidth, 0);
269 SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
270
271 // We want to shuffle the result down from a high element of a vector, so
272 // the offset must be positive.
273 if (Offset.isNegative())
274 return false;
275
276 // The offset must be a multiple of the scalar element to shuffle cleanly
277 // in the element's size.
278 uint64_t ScalarSizeInBytes = ScalarSize / 8;
279 if (Offset.urem(ScalarSizeInBytes) != 0)
280 return false;
281
282 // If we load MinVecNumElts, will our target element still be loaded?
283 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
284 if (OffsetEltIndex >= MinVecNumElts)
285 return false;
286
287 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
288 &DT))
289 return false;
290
291 // Update alignment with offset value. Note that the offset could be negated
292 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
293 // negation does not change the result of the alignment calculation.
294 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
295 }
296
297 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
298 // Use the greater of the alignment on the load or its source pointer.
299 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
300 Type *LoadTy = Load->getType();
301 unsigned AS = Load->getPointerAddressSpace();
302 InstructionCost OldCost =
303 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
304 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
305 OldCost +=
306 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
307 /* Insert */ true, HasExtract, CostKind);
308
309 // New pattern: load VecPtr
310 InstructionCost NewCost =
311 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
312 // Optionally, we are shuffling the loaded vector element(s) into place.
313 // For the mask set everything but element 0 to undef to prevent poison from
314 // propagating from the extra loaded memory. This will also optionally
315 // shrink/grow the vector from the loaded size to the output size.
316 // We assume this operation has no cost in codegen if there was no offset.
317 // Note that we could use freeze to avoid poison problems, but then we might
318 // still need a shuffle to change the vector size.
319 auto *Ty = cast<FixedVectorType>(I.getType());
320 unsigned OutputNumElts = Ty->getNumElements();
321 SmallVector<int> Mask(OutputNumElts, PoisonMaskElem);
322 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
323 Mask[0] = OffsetEltIndex;
324 if (OffsetEltIndex)
325 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
326 CostKind);
327
328 // We can aggressively convert to the vector form because the backend can
329 // invert this transform if it does not result in a performance win.
330 if (OldCost < NewCost || !NewCost.isValid())
331 return false;
332
333 // It is safe and potentially profitable to load a vector directly:
334 // inselt undef, load Scalar, 0 --> load VecPtr
335 IRBuilder<> Builder(Load);
336 Value *CastedPtr =
337 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
338 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
339 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
340
341 replaceValue(I, *VecLd);
342 ++NumVecLoad;
343 return true;
344}
345
346/// If we are loading a vector and then inserting it into a larger vector with
347/// undefined elements, try to load the larger vector and eliminate the insert.
348/// This removes a shuffle in IR and may allow combining of other loaded values.
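/// For example (illustrative IR, assuming the wider load is known to be
/// dereferenceable):
///   %v = load <2 x float>, ptr %p
///   %r = shufflevector <2 x float> %v, <2 x float> poison,
///                      <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
///   -->
///   %r = load <4 x float>, ptr %p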
349bool VectorCombine::widenSubvectorLoad(Instruction &I) {
350 // Match subvector insert of fixed vector.
351 auto *Shuf = cast<ShuffleVectorInst>(&I);
352 if (!Shuf->isIdentityWithPadding())
353 return false;
354
355 // Allow a non-canonical shuffle mask that is choosing elements from op1.
356 unsigned NumOpElts =
357 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
358 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
359 return M >= (int)(NumOpElts);
360 });
361
362 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
363 if (!canWidenLoad(Load, TTI))
364 return false;
365
366 // We use minimal alignment (maximum flexibility) because we only care about
367 // the dereferenceable region. When calculating cost and creating a new op,
368 // we may use a larger value based on alignment attributes.
369 auto *Ty = cast<FixedVectorType>(I.getType());
370 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
371 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
372 Align Alignment = Load->getAlign();
373 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
374 return false;
375
376 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
377 Type *LoadTy = Load->getType();
378 unsigned AS = Load->getPointerAddressSpace();
379
380 // Original pattern: insert_subvector (load PtrOp)
381 // This conservatively assumes that the cost of a subvector insert into an
382 // undef value is 0. We could add that cost if the cost model accurately
383 // reflects the real cost of that operation.
384 InstructionCost OldCost =
385 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
386
387 // New pattern: load PtrOp
388 InstructionCost NewCost =
389 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
390
391 // We can aggressively convert to the vector form because the backend can
392 // invert this transform if it does not result in a performance win.
393 if (OldCost < NewCost || !NewCost.isValid())
394 return false;
395
396 IRBuilder<> Builder(Load);
397 Value *CastedPtr =
398 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
399 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
400 replaceValue(I, *VecLd);
401 ++NumVecLoad;
402 return true;
403}
404
405/// Determine which, if any, of the inputs should be replaced by a shuffle
406/// followed by extract from a different index.
407ExtractElementInst *VectorCombine::getShuffleExtract(
408 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
409 unsigned PreferredExtractIndex = InvalidIndex) const {
410 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
411 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
412 assert(Index0C && Index1C && "Expected constant extract indexes");
413
414 unsigned Index0 = Index0C->getZExtValue();
415 unsigned Index1 = Index1C->getZExtValue();
416
417 // If the extract indexes are identical, no shuffle is needed.
418 if (Index0 == Index1)
419 return nullptr;
420
421 Type *VecTy = Ext0->getVectorOperand()->getType();
422 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
423 InstructionCost Cost0 =
424 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
425 InstructionCost Cost1 =
426 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
427
428 // If both costs are invalid no shuffle is needed
429 if (!Cost0.isValid() && !Cost1.isValid())
430 return nullptr;
431
432 // We are extracting from 2 different indexes, so one operand must be shuffled
433 // before performing a vector operation and/or extract. The more expensive
434 // extract will be replaced by a shuffle.
435 if (Cost0 > Cost1)
436 return Ext0;
437 if (Cost1 > Cost0)
438 return Ext1;
439
440 // If the costs are equal and there is a preferred extract index, shuffle the
441 // opposite operand.
442 if (PreferredExtractIndex == Index0)
443 return Ext1;
444 if (PreferredExtractIndex == Index1)
445 return Ext0;
446
447 // Otherwise, replace the extract with the higher index.
448 return Index0 > Index1 ? Ext0 : Ext1;
449}
450
451/// Compare the relative costs of 2 extracts followed by scalar operation vs.
452/// vector operation(s) followed by extract. Return true if the existing
453/// instructions are cheaper than a vector alternative. Otherwise, return false
454/// and if one of the extracts should be transformed to a shufflevector, set
455/// \p ConvertToShuffle to that extract instruction.
456bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
457 ExtractElementInst *Ext1,
458 const Instruction &I,
459 ExtractElementInst *&ConvertToShuffle,
460 unsigned PreferredExtractIndex) {
461 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
462 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
463 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
464
465 unsigned Opcode = I.getOpcode();
466 Value *Ext0Src = Ext0->getVectorOperand();
467 Value *Ext1Src = Ext1->getVectorOperand();
468 Type *ScalarTy = Ext0->getType();
469 auto *VecTy = cast<VectorType>(Ext0Src->getType());
470 InstructionCost ScalarOpCost, VectorOpCost;
471
472 // Get cost estimates for scalar and vector versions of the operation.
473 bool IsBinOp = Instruction::isBinaryOp(Opcode);
474 if (IsBinOp) {
475 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
476 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
477 } else {
478 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
479 "Expected a compare");
480 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
481 ScalarOpCost = TTI.getCmpSelInstrCost(
482 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
483 VectorOpCost = TTI.getCmpSelInstrCost(
484 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
485 }
486
487 // Get cost estimates for the extract elements. These costs will factor into
488 // both sequences.
489 unsigned Ext0Index = Ext0IndexC->getZExtValue();
490 unsigned Ext1Index = Ext1IndexC->getZExtValue();
491
492 InstructionCost Extract0Cost =
493 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
494 InstructionCost Extract1Cost =
495 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
496
497 // A more expensive extract will always be replaced by a splat shuffle.
498 // For example, if Ext0 is more expensive:
499 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
500 // extelt (opcode (splat V0, Ext0), V1), Ext1
501 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
502 // check the cost of creating a broadcast shuffle and shuffling both
503 // operands to element 0.
504 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
505 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
506 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
507
508 // Extra uses of the extracts mean that we include those costs in the
509 // vector total because those instructions will not be eliminated.
510 InstructionCost OldCost, NewCost;
511 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
512 // Handle a special case. If the 2 extracts are identical, adjust the
513 // formulas to account for that. The extra use charge allows for either the
514 // CSE'd pattern or an unoptimized form with identical values:
515 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
516 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
517 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
518 OldCost = CheapExtractCost + ScalarOpCost;
519 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
520 } else {
521 // Handle the general case. Each extract is actually a different value:
522 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
523 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
524 NewCost = VectorOpCost + CheapExtractCost +
525 !Ext0->hasOneUse() * Extract0Cost +
526 !Ext1->hasOneUse() * Extract1Cost;
527 }
528
529 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
530 if (ConvertToShuffle) {
531 if (IsBinOp && DisableBinopExtractShuffle)
532 return true;
533
534 // If we are extracting from 2 different indexes, then one operand must be
535 // shuffled before performing the vector operation. The shuffle mask is
536 // poison except for 1 lane that is being translated to the remaining
537 // extraction lane. Therefore, it is a splat shuffle. Ex:
538 // ShufMask = { poison, poison, 0, poison }
539 // TODO: The cost model has an option for a "broadcast" shuffle
540 // (splat-from-element-0), but no option for a more general splat.
541 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
542 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
543 PoisonMaskElem);
544 ShuffleMask[BestInsIndex] = BestExtIndex;
545 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
546 VecTy, VecTy, ShuffleMask, CostKind, 0,
547 nullptr, {ConvertToShuffle});
548 } else {
549 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
550 VecTy, VecTy, {}, CostKind, 0, nullptr,
551 {ConvertToShuffle});
552 }
553 }
554
555 // Aggressively form a vector op if the cost is equal because the transform
556 // may enable further optimization.
557 // Codegen can reverse this transform (scalarize) if it was not profitable.
558 return OldCost < NewCost;
559}
560
561/// Create a shuffle that translates (shifts) 1 element from the input vector
562/// to a new element location.
563static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
564 unsigned NewIndex, IRBuilderBase &Builder) {
565 // The shuffle mask is poison except for 1 lane that is being translated
566 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
567 // ShufMask = { 2, poison, poison, poison }
568 auto *VecTy = cast<FixedVectorType>(Vec->getType());
569 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
570 ShufMask[NewIndex] = OldIndex;
571 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
572}
573
574/// Given an extract element instruction with constant index operand, shuffle
575/// the source vector (shift the scalar element) to a NewIndex for extraction.
576/// Return null if the input can be constant folded, so that we are not creating
577/// unnecessary instructions.
578static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
579 IRBuilderBase &Builder) {
580 // Shufflevectors can only be created for fixed-width vectors.
581 Value *X = ExtElt->getVectorOperand();
582 if (!isa<FixedVectorType>(X->getType()))
583 return nullptr;
584
585 // If the extract can be constant-folded, this code is unsimplified. Defer
586 // to other passes to handle that.
587 Value *C = ExtElt->getIndexOperand();
588 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
589 if (isa<Constant>(X))
590 return nullptr;
591
592 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
593 NewIndex, Builder);
594 return Shuf;
595}
596
597/// Try to reduce extract element costs by converting scalar compares to vector
598/// compares followed by extract.
599/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
600Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
601 Instruction &I) {
602 assert(isa<CmpInst>(&I) && "Expected a compare");
603
604 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
605 // --> extelt (cmp Pred V0, V1), ExtIndex
606 ++NumVecCmp;
607 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
608 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
609 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
610}
611
612/// Try to reduce extract element costs by converting scalar binops to vector
613/// binops followed by extract.
614/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
615Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
616 Instruction &I) {
617 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
618
619 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
620 // --> extelt (bo V0, V1), ExtIndex
621 ++NumVecBO;
622 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
623 V1, "foldExtExtBinop");
624
625 // All IR flags are safe to back-propagate because any potential poison
626 // created in unused vector elements is discarded by the extract.
627 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
628 VecBOInst->copyIRFlags(&I);
629
630 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
631}
632
633/// Match an instruction with extracted vector operands.
634bool VectorCombine::foldExtractExtract(Instruction &I) {
635 // It is not safe to transform things like div, urem, etc. because we may
636 // create undefined behavior when executing those on unknown vector elements.
637 if (!isSafeToSpeculativelyExecute(&I))
638 return false;
639
640 Instruction *I0, *I1;
641 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
642 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
643 !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
644 return false;
645
646 Value *V0, *V1;
647 uint64_t C0, C1;
648 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
649 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
650 V0->getType() != V1->getType())
651 return false;
652
653 // If the scalar value 'I' is going to be re-inserted into a vector, then try
654 // to create an extract to that same element. The extract/insert can be
655 // reduced to a "select shuffle".
656 // TODO: If we add a larger pattern match that starts from an insert, this
657 // probably becomes unnecessary.
658 auto *Ext0 = cast<ExtractElementInst>(I0);
659 auto *Ext1 = cast<ExtractElementInst>(I1);
660 uint64_t InsertIndex = InvalidIndex;
661 if (I.hasOneUse())
662 match(I.user_back(),
663 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
664
665 ExtractElementInst *ExtractToChange;
666 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
667 return false;
668
669 Value *ExtOp0 = Ext0->getVectorOperand();
670 Value *ExtOp1 = Ext1->getVectorOperand();
671
672 if (ExtractToChange) {
673 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
674 Value *NewExtOp =
675 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
676 if (!NewExtOp)
677 return false;
678 if (ExtractToChange == Ext0)
679 ExtOp0 = NewExtOp;
680 else
681 ExtOp1 = NewExtOp;
682 }
683
684 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
685 : Ext0->getIndexOperand();
686 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
687 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
688 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
689 Worklist.push(Ext0);
690 Worklist.push(Ext1);
691 replaceValue(I, *NewExt);
692 return true;
693}
694
695/// Try to replace an extract + scalar fneg + insert with a vector fneg +
696/// shuffle.
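/// For example (illustrative IR, with Index == 1):
///   %e = extractelement <4 x float> %src, i64 1
///   %n = fneg float %e
///   %r = insertelement <4 x float> %dst, float %n, i64 1
///   -->
///   %vn = fneg <4 x float> %src
///   %r  = shufflevector <4 x float> %dst, <4 x float> %vn,
///                       <4 x i32> <i32 0, i32 5, i32 2, i32 3>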
697bool VectorCombine::foldInsExtFNeg(Instruction &I) {
698 // Match an insert (op (extract)) pattern.
699 Value *DestVec;
700 uint64_t Index;
701 Instruction *FNeg;
702 if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
703 m_ConstantInt(Index))))
704 return false;
705
706 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
707 Value *SrcVec;
708 Instruction *Extract;
709 if (!match(FNeg, m_FNeg(m_CombineAnd(
710 m_Instruction(Extract),
711 m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
712 return false;
713
714 auto *VecTy = cast<FixedVectorType>(I.getType());
715 auto *ScalarTy = VecTy->getScalarType();
716 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
717 if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType())
718 return false;
719
720 // Ignore bogus insert/extract index.
721 unsigned NumElts = VecTy->getNumElements();
722 if (Index >= NumElts)
723 return false;
724
725 // We are inserting the negated element into the same lane that we extracted
726 // from. This is equivalent to a select-shuffle that chooses all but the
727 // negated element from the destination vector.
728 SmallVector<int> Mask(NumElts);
729 std::iota(Mask.begin(), Mask.end(), 0);
730 Mask[Index] = Index + NumElts;
731 InstructionCost OldCost =
732 TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) +
733 TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
734
735 // If the extract has one use, it will be eliminated, so count it in the
736 // original cost. If it has more than one use, ignore the cost because it will
737 // be the same before/after.
738 if (Extract->hasOneUse())
739 OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);
740
741 InstructionCost NewCost =
742 TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
744 Mask, CostKind);
745
746 bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
747 // If the lengths of the two vectors are not equal,
748 // we need to add a length-change vector. Add this cost.
749 SmallVector<int> SrcMask;
750 if (NeedLenChg) {
751 SrcMask.assign(NumElts, PoisonMaskElem);
752 SrcMask[Index] = Index;
754 VecTy, SrcVecTy, SrcMask, CostKind);
755 }
756
757 if (NewCost > OldCost)
758 return false;
759
760 Value *NewShuf;
761 // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index
762 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
763 if (NeedLenChg) {
764 // shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
765 Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
766 NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask);
767 } else {
768 // shuffle DestVec, (fneg SrcVec), Mask
769 NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
770 }
771
772 replaceValue(I, *NewShuf);
773 return true;
774}
775
776/// Try to fold insert(binop(x,y),binop(a,b),idx)
777/// --> binop(insert(x,a,idx),insert(y,b,idx))
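/// For example (illustrative IR, with idx == 2):
///   %v = add <4 x i32> %x, %y
///   %s = add i32 %a, %b
///   %r = insertelement <4 x i32> %v, i32 %s, i64 2
///   -->
///   %i0 = insertelement <4 x i32> %x, i32 %a, i64 2
///   %i1 = insertelement <4 x i32> %y, i32 %b, i64 2
///   %r  = add <4 x i32> %i0, %i1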
778bool VectorCombine::foldInsExtBinop(Instruction &I) {
779 BinaryOperator *VecBinOp, *SclBinOp;
780 uint64_t Index;
781 if (!match(&I,
782 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
783 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
784 return false;
785
786 // TODO: Add support for addlike etc.
787 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
788 if (BinOpcode != SclBinOp->getOpcode())
789 return false;
790
791 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
792 if (!ResultTy)
793 return false;
794
795 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
796 // shuffle?
797
798 InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
799 TTI.getInstructionCost(VecBinOp, CostKind) +
800 TTI.getInstructionCost(SclBinOp, CostKind);
801 InstructionCost NewCost =
802 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
803 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
804 Index, VecBinOp->getOperand(0),
805 SclBinOp->getOperand(0)) +
806 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
807 Index, VecBinOp->getOperand(1),
808 SclBinOp->getOperand(1));
809
810 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
811 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
812 << "\n");
813 if (NewCost > OldCost)
814 return false;
815
816 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
817 SclBinOp->getOperand(0), Index);
818 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
819 SclBinOp->getOperand(1), Index);
820 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
821
822 // Intersect flags from the old binops.
823 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
824 NewInst->copyIRFlags(VecBinOp);
825 NewInst->andIRFlags(SclBinOp);
826 }
827
828 Worklist.pushValue(NewIns0);
829 Worklist.pushValue(NewIns1);
830 replaceValue(I, *NewBO);
831 return true;
832}
833
834/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
835/// Supports: bitcast, trunc, sext, zext
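/// For example (illustrative IR):
///   %zx = zext <4 x i16> %x to <4 x i32>
///   %zy = zext <4 x i16> %y to <4 x i32>
///   %r  = and <4 x i32> %zx, %zy
///   -->
///   %a = and <4 x i16> %x, %y
///   %r = zext <4 x i16> %a to <4 x i32>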
836bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
837 // Check if this is a bitwise logic operation
838 auto *BinOp = dyn_cast<BinaryOperator>(&I);
839 if (!BinOp || !BinOp->isBitwiseLogicOp())
840 return false;
841
842 // Get the cast instructions
843 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
844 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
845 if (!LHSCast || !RHSCast) {
846 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
847 return false;
848 }
849
850 // Both casts must be the same type
851 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
852 if (CastOpcode != RHSCast->getOpcode())
853 return false;
854
855 // Only handle supported cast operations
856 switch (CastOpcode) {
857 case Instruction::BitCast:
858 case Instruction::Trunc:
859 case Instruction::SExt:
860 case Instruction::ZExt:
861 break;
862 default:
863 return false;
864 }
865
866 Value *LHSSrc = LHSCast->getOperand(0);
867 Value *RHSSrc = RHSCast->getOperand(0);
868
869 // Source types must match
870 if (LHSSrc->getType() != RHSSrc->getType())
871 return false;
872
873 // Only handle vector types with integer elements
874 auto *SrcVecTy = dyn_cast<FixedVectorType>(LHSSrc->getType());
875 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
876 if (!SrcVecTy || !DstVecTy)
877 return false;
878
879 if (!SrcVecTy->getScalarType()->isIntegerTy() ||
880 !DstVecTy->getScalarType()->isIntegerTy())
881 return false;
882
883 // Cost Check :
884 // OldCost = bitlogic + 2*casts
885 // NewCost = bitlogic + cast
886
887 // Calculate specific costs for each cast with instruction context
888 InstructionCost LHSCastCost =
889 TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,
890 TTI::CastContextHint::None, CostKind, LHSCast);
891 InstructionCost RHSCastCost =
892 TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,
893 TTI::CastContextHint::None, CostKind, RHSCast);
894
895 InstructionCost OldCost =
896 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstVecTy, CostKind) +
897 LHSCastCost + RHSCastCost;
898
899 // For new cost, we can't provide an instruction (it doesn't exist yet)
900 InstructionCost GenericCastCost = TTI.getCastInstrCost(
901 CastOpcode, DstVecTy, SrcVecTy, TTI::CastContextHint::None, CostKind);
902
903 InstructionCost NewCost =
904 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcVecTy, CostKind) +
905 GenericCastCost;
906
907 // Account for multi-use casts using specific costs
908 if (!LHSCast->hasOneUse())
909 NewCost += LHSCastCost;
910 if (!RHSCast->hasOneUse())
911 NewCost += RHSCastCost;
912
913 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
914 << " NewCost=" << NewCost << "\n");
915
916 if (NewCost > OldCost)
917 return false;
918
919 // Create the operation on the source type
920 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
921 BinOp->getName() + ".inner");
922 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
923 NewBinOp->copyIRFlags(BinOp);
924
925 Worklist.pushValue(NewOp);
926
927 // Create the cast operation directly to ensure we get a new instruction
928 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
929
930 // Preserve cast instruction flags
931 NewCast->copyIRFlags(LHSCast);
932 NewCast->andIRFlags(RHSCast);
933
934 // Insert the new instruction
935 Value *Result = Builder.Insert(NewCast);
936
937 replaceValue(I, *Result);
938 return true;
939}
940
941struct PreservedCastFlags {
942 bool NNeg = false;
943 bool NUW = false;
944 bool NSW = false;
945};
946
947// Try to cast C to InvC losslessly, satisfying CastOp(InvC) == C.
948// Will try best to preserve the flags.
949static Constant *getLosslessInvCast(Constant *C, Type *InvCastTo,
950 Instruction::CastOps CastOp,
951 const DataLayout &DL,
952 PreservedCastFlags &Flags) {
953 switch (CastOp) {
954 case Instruction::BitCast:
955 // Bitcast is always lossless.
956 return ConstantFoldCastOperand(Instruction::BitCast, C, InvCastTo, DL);
957 case Instruction::Trunc: {
958 auto *ZExtC = ConstantFoldCastOperand(Instruction::ZExt, C, InvCastTo, DL);
959 auto *SExtC = ConstantFoldCastOperand(Instruction::SExt, C, InvCastTo, DL);
960 // Truncation back on ZExt value is always NUW.
961 Flags.NUW = true;
962 // Test positivity of C.
963 Flags.NSW = ZExtC == SExtC;
964 return ZExtC;
965 }
966 case Instruction::SExt:
967 case Instruction::ZExt: {
968 auto *InvC = ConstantExpr::getTrunc(C, InvCastTo);
969 auto *CastInvC = ConstantFoldCastOperand(CastOp, InvC, C->getType(), DL);
970 // Must satisfy CastOp(InvC) == C.
971 if (!CastInvC || CastInvC != C)
972 return nullptr;
973 if (CastOp == Instruction::ZExt) {
974 auto *SExtInvC =
975 ConstantFoldCastOperand(Instruction::SExt, InvC, C->getType(), DL);
976 // Test positivity of InvC.
977 Flags.NNeg = CastInvC == SExtInvC;
978 }
979 return InvC;
980 }
981 default:
982 return nullptr;
983 }
984}
985
986/// Match:
987// bitop(castop(x), C) ->
988// bitop(castop(x), castop(InvC)) ->
989// castop(bitop(x, InvC))
990// Supports: bitcast
991bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
993 Constant *C;
994
995 // Check if this is a bitwise logic operation
997 return false;
998
999 // Get the cast instructions
1000 auto *LHSCast = dyn_cast<CastInst>(LHS);
1001 if (!LHSCast)
1002 return false;
1003
1004 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
1005
1006 // Only handle supported cast operations
1007 switch (CastOpcode) {
1008 case Instruction::BitCast:
1009 break;
1010 default:
1011 return false;
1012 }
1013
1014 Value *LHSSrc = LHSCast->getOperand(0);
1015
1016 // Only handle vector types with integer elements
1017 auto *SrcVecTy = dyn_cast<FixedVectorType>(LHSSrc->getType());
1018 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
1019 if (!SrcVecTy || !DstVecTy)
1020 return false;
1021
1022 if (!SrcVecTy->getScalarType()->isIntegerTy() ||
1023 !DstVecTy->getScalarType()->isIntegerTy())
1024 return false;
1025
1026 // Find the constant InvC, such that castop(InvC) equals to C.
1027 PreservedCastFlags RHSFlags;
1028 Constant *InvC = getLosslessInvCast(C, SrcVecTy, CastOpcode, *DL, RHSFlags);
1029 if (!InvC)
1030 return false;
1031
1032 // Cost Check :
1033 // OldCost = bitlogic + cast
1034 // NewCost = bitlogic + cast
1035
1036 // Calculate specific costs for each cast with instruction context
1037 InstructionCost LHSCastCost =
1038 TTI.getCastInstrCost(CastOpcode, DstVecTy, SrcVecTy,
1039 TTI::CastContextHint::None, CostKind, LHSCast);
1040
1041 InstructionCost OldCost =
1042 TTI.getArithmeticInstrCost(I.getOpcode(), DstVecTy, CostKind) +
1043 LHSCastCost;
1044
1045 // For new cost, we can't provide an instruction (it doesn't exist yet)
1046 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1047 CastOpcode, DstVecTy, SrcVecTy, TTI::CastContextHint::None, CostKind);
1048
1049 InstructionCost NewCost =
1050 TTI.getArithmeticInstrCost(I.getOpcode(), SrcVecTy, CostKind) +
1051 GenericCastCost;
1052
1053 // Account for multi-use casts using specific costs
1054 if (!LHSCast->hasOneUse())
1055 NewCost += LHSCastCost;
1056
1057 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1058 << " NewCost=" << NewCost << "\n");
1059
1060 if (NewCost > OldCost)
1061 return false;
1062
1063 // Create the operation on the source type
1064 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1065 LHSSrc, InvC, I.getName() + ".inner");
1066 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1067 NewBinOp->copyIRFlags(&I);
1068
1069 Worklist.pushValue(NewOp);
1070
1071 // Create the cast operation directly to ensure we get a new instruction
1072 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1073
1074 // Insert the new instruction
1075 Value *Result = Builder.Insert(NewCast);
1076
1077 replaceValue(I, *Result);
1078 return true;
1079}
1080
1081/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1082/// destination type followed by shuffle. This can enable further transforms by
1083/// moving bitcasts or shuffles together.
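/// For example (illustrative IR, narrowing i64 elements to i32):
///   %s = shufflevector <2 x i64> %x, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
///   %r = bitcast <2 x i64> %s to <4 x i32>
///   -->
///   %b = bitcast <2 x i64> %x to <4 x i32>
///   %r = shufflevector <4 x i32> %b, <4 x i32> poison,
///                      <4 x i32> <i32 2, i32 3, i32 0, i32 1>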
1084bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1085 Value *V0, *V1;
1086 ArrayRef<int> Mask;
1087 if (!match(&I, m_BitCast(m_OneUse(
1088 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1089 return false;
1090
1091 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1092 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1093 // mask for scalable type is a splat or not.
1094 // 2) Disallow non-vector casts.
1095 // TODO: We could allow any shuffle.
1096 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1097 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1098 if (!DestTy || !SrcTy)
1099 return false;
1100
1101 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1102 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1103 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1104 return false;
1105
1106 bool IsUnary = isa<UndefValue>(V1);
1107
1108 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1109 // if it won't increase the number of bitcasts.
1110 if (!IsUnary) {
1111 auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
1112 auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
1113 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1114 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1115 return false;
1116 }
1117
1118 SmallVector<int, 16> NewMask;
1119 if (DestEltSize <= SrcEltSize) {
1120 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1121 // always be expanded to the equivalent form choosing narrower elements.
1122 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
1123 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1124 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1125 } else {
1126 // The bitcast is from narrow elements to wide elements. The shuffle mask
1127 // must choose consecutive elements to allow casting first.
1128 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
1129 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1130 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1131 return false;
1132 }
1133
1134 // Bitcast the shuffle src - keep its original width but using the destination
1135 // scalar type.
1136 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1137 auto *NewShuffleTy =
1138 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1139 auto *OldShuffleTy =
1140 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1141 unsigned NumOps = IsUnary ? 1 : 2;
1142
1143 // The new shuffle must not cost more than the old shuffle.
1144 TargetTransformInfo::ShuffleKind SK =
1145 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
1146 : TargetTransformInfo::SK_PermuteTwoSrc;
1147
1148 InstructionCost NewCost =
1149 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1150 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1151 TargetTransformInfo::CastContextHint::None,
1152 CostKind));
1153 InstructionCost OldCost =
1154 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1155 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1156 TargetTransformInfo::CastContextHint::None,
1157 CostKind);
1158
1159 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1160 << OldCost << " vs NewCost: " << NewCost << "\n");
1161
1162 if (NewCost > OldCost || !NewCost.isValid())
1163 return false;
1164
1165 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1166 ++NumShufOfBitcast;
1167 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1168 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1169 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1170 replaceValue(I, *Shuf);
1171 return true;
1172}
1173
1174/// VP Intrinsics whose vector operands are both splat values may be simplified
1175/// into the scalar version of the operation and the result splatted. This
1176/// can lead to scalarization down the line.
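/// For example (illustrative IR, where %xs and %ys are splats of the scalars
/// %x and %y and the mask is all-true):
///   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %xs, <4 x i32> %ys,
///                                          <4 x i1> splat (i1 true), i32 %evl)
///   -->
///   %s = add i32 %x, %y
///   %h = insertelement <4 x i32> poison, i32 %s, i64 0
///   %r = shufflevector <4 x i32> %h, <4 x i32> poison, <4 x i32> zeroinitializer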
1177bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1178 if (!isa<VPIntrinsic>(I))
1179 return false;
1180 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1181 Value *Op0 = VPI.getArgOperand(0);
1182 Value *Op1 = VPI.getArgOperand(1);
1183
1184 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1185 return false;
1186
1187 // Check getSplatValue early in this function, to avoid doing unnecessary
1188 // work.
1189 Value *ScalarOp0 = getSplatValue(Op0);
1190 Value *ScalarOp1 = getSplatValue(Op1);
1191 if (!ScalarOp0 || !ScalarOp1)
1192 return false;
1193
1194 // For the binary VP intrinsics supported here, the result on disabled lanes
1195 // is a poison value. For now, only do this simplification if all lanes
1196 // are active.
1197 // TODO: Relax the condition that all lanes are active by using insertelement
1198 // on inactive lanes.
1199 auto IsAllTrueMask = [](Value *MaskVal) {
1200 if (Value *SplattedVal = getSplatValue(MaskVal))
1201 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1202 return ConstValue->isAllOnesValue();
1203 return false;
1204 };
1205 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1206 return false;
1207
1208 // Check to make sure we support scalarization of the intrinsic
1209 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1210 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1211 return false;
1212
1213 // Calculate cost of splatting both operands into vectors and the vector
1214 // intrinsic
1215 VectorType *VecTy = cast<VectorType>(VPI.getType());
1216 SmallVector<int> Mask;
1217 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1218 Mask.resize(FVTy->getNumElements(), 0);
1219 InstructionCost SplatCost =
1220 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
1221 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, Mask,
1222 CostKind);
1223
1224 // Calculate the cost of the VP Intrinsic
1225 SmallVector<Type *> Args;
1226 for (Value *V : VPI.args())
1227 Args.push_back(V->getType());
1228 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1229 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1230 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1231
1232 // Determine scalar opcode
1233 std::optional<unsigned> FunctionalOpcode =
1234 VPI.getFunctionalOpcode();
1235 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1236 if (!FunctionalOpcode) {
1237 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1238 if (!ScalarIntrID)
1239 return false;
1240 }
1241
1242 // Calculate cost of scalarizing
1243 InstructionCost ScalarOpCost = 0;
1244 if (ScalarIntrID) {
1245 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1246 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1247 } else {
1248 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1249 VecTy->getScalarType(), CostKind);
1250 }
1251
1252 // The existing splats may be kept around if other instructions use them.
1253 InstructionCost CostToKeepSplats =
1254 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1255 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1256
1257 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1258 << "\n");
1259 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1260 << ", Cost of scalarizing:" << NewCost << "\n");
1261
1262 // We want to scalarize unless the vector variant actually has lower cost.
1263 if (OldCost < NewCost || !NewCost.isValid())
1264 return false;
1265
1266 // Scalarize the intrinsic
1267 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1268 Value *EVL = VPI.getArgOperand(3);
1269
1270 // If the VP op might introduce UB or poison, we can scalarize it provided
1271 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1272 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1273 // scalarizing it.
1274 bool SafeToSpeculate;
1275 if (ScalarIntrID)
1276 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1277 .hasAttribute(Attribute::AttrKind::Speculatable);
1278 else
1279 SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
1280 *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
1281 if (!SafeToSpeculate &&
1282 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
1283 return false;
1284
1285 Value *ScalarVal =
1286 ScalarIntrID
1287 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1288 {ScalarOp0, ScalarOp1})
1289 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1290 ScalarOp0, ScalarOp1);
1291
1292 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1293 return true;
1294}
1295
1296/// Match a vector op/compare/intrinsic with at least one
1297/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1298/// by insertelement.
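/// For example (illustrative IR):
///   %ins = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %x, i64 0
///   %r   = add <4 x i32> %ins, <i32 7, i32 7, i32 7, i32 7>
///   -->
///   %s = add i32 %x, 7
///   %r = insertelement <4 x i32> <i32 7, i32 8, i32 9, i32 10>, i32 %s, i64 0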
1299bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1300 auto *UO = dyn_cast<UnaryOperator>(&I);
1301 auto *BO = dyn_cast<BinaryOperator>(&I);
1302 auto *CI = dyn_cast<CmpInst>(&I);
1303 auto *II = dyn_cast<IntrinsicInst>(&I);
1304 if (!UO && !BO && !CI && !II)
1305 return false;
1306
1307 // TODO: Allow intrinsics with different argument types
1308 if (II) {
1309 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1310 return false;
1311 for (auto [Idx, Arg] : enumerate(II->args()))
1312 if (Arg->getType() != II->getType() &&
1313 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1314 return false;
1315 }
1316
1317 // Do not convert the vector condition of a vector select into a scalar
1318 // condition. That may cause problems for codegen because of differences in
1319 // boolean formats and register-file transfers.
1320 // TODO: Can we account for that in the cost model?
1321 if (CI)
1322 for (User *U : I.users())
1323 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1324 return false;
1325
1326 // Match constant vectors or scalars being inserted into constant vectors:
1327 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1328 SmallVector<Value *> VecCs, ScalarOps;
1329 std::optional<uint64_t> Index;
1330
1331 auto Ops = II ? II->args() : I.operands();
1332 for (auto [OpNum, Op] : enumerate(Ops)) {
1333 Constant *VecC;
1334 Value *V;
1335 uint64_t InsIdx = 0;
1336 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1337 m_ConstantInt(InsIdx)))) {
1338 // Bail if any inserts are out of bounds.
1339 VectorType *OpTy = cast<VectorType>(Op->getType());
1340 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1341 return false;
1342 // All inserts must have the same index.
1343 // TODO: Deal with mismatched index constants and variable indexes?
1344 if (!Index)
1345 Index = InsIdx;
1346 else if (InsIdx != *Index)
1347 return false;
1348 VecCs.push_back(VecC);
1349 ScalarOps.push_back(V);
1350 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1351 OpNum, &TTI)) {
1352 VecCs.push_back(Op.get());
1353 ScalarOps.push_back(Op.get());
1354 } else if (match(Op.get(), m_Constant(VecC))) {
1355 VecCs.push_back(VecC);
1356 ScalarOps.push_back(nullptr);
1357 } else {
1358 return false;
1359 }
1360 }
1361
1362 // Bail if all operands are constant.
1363 if (!Index.has_value())
1364 return false;
1365
1366 VectorType *VecTy = cast<VectorType>(I.getType());
1367 Type *ScalarTy = VecTy->getScalarType();
1368 assert(VecTy->isVectorTy() &&
1369 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1370 ScalarTy->isPointerTy()) &&
1371 "Unexpected types for insert element into binop or cmp");
1372
1373 unsigned Opcode = I.getOpcode();
1374 InstructionCost ScalarOpCost, VectorOpCost;
1375 if (CI) {
1376 CmpInst::Predicate Pred = CI->getPredicate();
1377 ScalarOpCost = TTI.getCmpSelInstrCost(
1378 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1379 VectorOpCost = TTI.getCmpSelInstrCost(
1380 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1381 } else if (UO || BO) {
1382 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1383 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1384 } else {
1385 IntrinsicCostAttributes ScalarICA(
1386 II->getIntrinsicID(), ScalarTy,
1387 SmallVector<Type *>(II->arg_size(), ScalarTy));
1388 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1389 IntrinsicCostAttributes VectorICA(
1390 II->getIntrinsicID(), VecTy,
1391 SmallVector<Type *>(II->arg_size(), VecTy));
1392 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1393 }
1394
1395 // Fold the vector constants in the original vectors into a new base vector to
1396 // get more accurate cost modelling.
1397 Value *NewVecC = nullptr;
1398 if (CI)
1399 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1400 else if (UO)
1401 NewVecC =
1402 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1403 else if (BO)
1404 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1405 else if (II)
1406 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1407
1408 if (!NewVecC)
1409 return false;
1410
1411 // Get cost estimate for the insert element. This cost will factor into
1412 // both sequences.
1413 InstructionCost OldCost = VectorOpCost;
1414 InstructionCost NewCost =
1415 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1416 CostKind, *Index, NewVecC);
1417
1418 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1419 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1420 II->getIntrinsicID(), Idx, &TTI)))
1421 continue;
1422 InstructionCost InsertCost = TTI.getVectorInstrCost(
1423 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1424 OldCost += InsertCost;
1425 NewCost += !Op->hasOneUse() * InsertCost;
1426 }
1427
1428 // We want to scalarize unless the vector variant actually has lower cost.
1429 if (OldCost < NewCost || !NewCost.isValid())
1430 return false;
1431
1432 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1433 // inselt NewVecC, (scalar_op V0, V1), Index
1434 if (CI)
1435 ++NumScalarCmp;
1436 else if (UO || BO)
1437 ++NumScalarOps;
1438 else
1439 ++NumScalarIntrinsic;
1440
1441 // For constant cases, extract the scalar element, this should constant fold.
1442 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1443 if (!Scalar)
1444 ScalarOps[OpIdx] = ConstantExpr::getExtractElement(
1445 cast<Constant>(VecC), Builder.getInt64(*Index));
1446
1447 Value *Scalar;
1448 if (CI)
1449 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1450 else if (UO || BO)
1451 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1452 else
1453 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1454
1455 Scalar->setName(I.getName() + ".scalar");
1456
1457 // All IR flags are safe to back-propagate. There is no potential for extra
1458 // poison to be created by the scalar instruction.
1459 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1460 ScalarInst->copyIRFlags(&I);
1461
1462 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1463 replaceValue(I, *Insert);
1464 return true;
1465}
1466
1467/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1468/// a vector into vector operations followed by extract. Note: The SLP pass
1469/// may miss this pattern because of implementation problems.
1470bool VectorCombine::foldExtractedCmps(Instruction &I) {
1471 auto *BI = dyn_cast<BinaryOperator>(&I);
1472
1473 // We are looking for a scalar binop of booleans.
1474 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1475 if (!BI || !I.getType()->isIntegerTy(1))
1476 return false;
1477
1478 // The compare predicates should match, and each compare should have a
1479 // constant operand.
1480 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1481 Instruction *I0, *I1;
1482 Constant *C0, *C1;
1483 CmpPredicate P0, P1;
1484 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1485 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1486 return false;
1487
1488 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1489 if (!MatchingPred)
1490 return false;
1491
1492 // The compare operands must be extracts of the same vector with constant
1493 // extract indexes.
1494 Value *X;
1495 uint64_t Index0, Index1;
1496 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1497 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1498 return false;
1499
1500 auto *Ext0 = cast<ExtractElementInst>(I0);
1501 auto *Ext1 = cast<ExtractElementInst>(I1);
1502 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1503 if (!ConvertToShuf)
1504 return false;
1505 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1506 "Unknown ExtractElementInst");
1507
1508 // The original scalar pattern is:
1509 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1510 CmpInst::Predicate Pred = *MatchingPred;
1511 unsigned CmpOpcode =
1512 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1513 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1514 if (!VecTy)
1515 return false;
1516
1517 InstructionCost Ext0Cost =
1518 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1519 InstructionCost Ext1Cost =
1520 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
1521 InstructionCost CmpCost = TTI.getCmpSelInstrCost(
1522 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1523 CostKind);
1524
1525 InstructionCost OldCost =
1526 Ext0Cost + Ext1Cost + CmpCost * 2 +
1527 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1528
1529 // The proposed vector pattern is:
1530 // vcmp = cmp Pred X, VecC
1531 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1532 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1533 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1534 auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecTy));
1535 InstructionCost NewCost = TTI.getCmpSelInstrCost(
1536 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1537 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1538 ShufMask[CheapIndex] = ExpensiveIndex;
1539 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
1540 CmpTy, ShufMask, CostKind);
1541 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1542 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1543 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1544 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1545
1546 // Aggressively form vector ops if the cost is equal because the transform
1547 // may enable further optimization.
1548 // Codegen can reverse this transform (scalarize) if it was not profitable.
1549 if (OldCost < NewCost || !NewCost.isValid())
1550 return false;
1551
1552 // Create a vector constant from the 2 scalar constants.
1553 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1554 PoisonValue::get(VecTy->getElementType()));
1555 CmpC[Index0] = C0;
1556 CmpC[Index1] = C1;
1557 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1558 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1559 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1560 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1561 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1562 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1563 replaceValue(I, *NewExt);
1564 ++NumVecCmpBO;
1565 return true;
1566}
1567
1568static void analyzeCostOfVecReduction(const IntrinsicInst &II,
1569 TTI::TargetCostKind CostKind,
1570 const TargetTransformInfo &TTI,
1571 InstructionCost &CostBeforeReduction,
1572 InstructionCost &CostAfterReduction) {
1573 Instruction *Op0, *Op1;
1574 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1575 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1576 unsigned ReductionOpc =
1577 getArithmeticReductionInstruction(II.getIntrinsicID());
1578 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1579 bool IsUnsigned = isa<ZExtInst>(RedOp);
1580 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1581
1582 CostBeforeReduction =
1583 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1584 TTI::CastContextHint::None, CostKind);
1585 CostAfterReduction =
1586 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1587 ExtType, FastMathFlags(), CostKind);
1588 return;
1589 }
1590 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1591 match(RedOp,
1592 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
1593 match(Op0, m_ZExtOrSExt(m_Value())) &&
1594 Op0->getOpcode() == Op1->getOpcode() &&
1595 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1596 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1597 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1598 bool IsUnsigned = isa<ZExtInst>(Op0);
1599 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1600 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1601
1602 InstructionCost ExtCost =
1603 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1604 TTI::CastContextHint::None, CostKind);
1605 InstructionCost MulCost =
1606 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1607 InstructionCost Ext2Cost =
1608 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1609 TTI::CastContextHint::None, CostKind);
1610
1611 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1612 CostAfterReduction = TTI.getMulAccReductionCost(
1613 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1614 return;
1615 }
1616 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1617 std::nullopt, CostKind);
1618}
1619
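// Fold binop(reduce(A), reduce(B)) into reduce(binop(A, B)) when the cost
// model agrees. Illustrative IR sketch (hypothetical types, not from the
// tests):
//   %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
//   %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
//   %s  = add i32 %r0, %r1
// -->
//   %v  = add <4 x i32> %a, %b
//   %s  = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)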
1620bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1621 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1622 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1623 if (BinOpOpc == Instruction::Sub)
1624 ReductionIID = Intrinsic::vector_reduce_add;
1625 if (ReductionIID == Intrinsic::not_intrinsic)
1626 return false;
1627
1628 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1629 Intrinsic::ID IID) -> Value * {
1630 auto *II = dyn_cast<IntrinsicInst>(V);
1631 if (!II)
1632 return nullptr;
1633 if (II->getIntrinsicID() == IID && II->hasOneUse())
1634 return II->getArgOperand(0);
1635 return nullptr;
1636 };
1637
1638 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1639 if (!V0)
1640 return false;
1641 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1642 if (!V1)
1643 return false;
1644
1645 auto *VTy = cast<VectorType>(V0->getType());
1646 if (V1->getType() != VTy)
1647 return false;
1648 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1649 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1650 unsigned ReductionOpc =
1651 getArithmeticReductionInstruction(II0.getIntrinsicID());
1652
1653 InstructionCost OldCost = 0;
1654 InstructionCost NewCost = 0;
1655 InstructionCost CostOfRedOperand0 = 0;
1656 InstructionCost CostOfRed0 = 0;
1657 InstructionCost CostOfRedOperand1 = 0;
1658 InstructionCost CostOfRed1 = 0;
1659 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1660 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1661 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1662 NewCost =
1663 CostOfRedOperand0 + CostOfRedOperand1 +
1664 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1665 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1666 if (NewCost >= OldCost || !NewCost.isValid())
1667 return false;
1668
1669 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1670 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1671 << "\n");
1672 Value *VectorBO;
1673 if (BinOpOpc == Instruction::Or)
1674 VectorBO = Builder.CreateOr(V0, V1, "",
1675 cast<PossiblyDisjointInst>(I).isDisjoint());
1676 else
1677 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1678
1679 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1680 replaceValue(I, *Rdx);
1681 return true;
1682}
1683
1684// Check if memory loc modified between two instrs in the same BB
1685static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1686 BasicBlock::iterator End,
1687 const MemoryLocation &Loc, AAResults &AA) {
1688 unsigned NumScanned = 0;
1689 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1690 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1691 ++NumScanned > MaxInstrsToScan;
1692 });
1693}
1694
1695namespace {
1696/// Helper class to indicate whether a vector index can be safely scalarized and
1697/// if a freeze needs to be inserted.
1698class ScalarizationResult {
1699 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1700
1701 StatusTy Status;
1702 Value *ToFreeze;
1703
1704 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1705 : Status(Status), ToFreeze(ToFreeze) {}
1706
1707public:
1708 ScalarizationResult(const ScalarizationResult &Other) = default;
1709 ~ScalarizationResult() {
1710 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1711 }
1712
1713 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1714 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1715 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1716 return {StatusTy::SafeWithFreeze, ToFreeze};
1717 }
1718
1719 /// Returns true if the index can be scalarized without requiring a freeze.
1720 bool isSafe() const { return Status == StatusTy::Safe; }
1721 /// Returns true if the index cannot be scalarized.
1722 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1723 /// Returns true if the index can be scalarized, but requires inserting a
1724 /// freeze.
1725 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1726
1727 /// Reset the state to Unsafe and clear ToFreeze if set.
1728 void discard() {
1729 ToFreeze = nullptr;
1730 Status = StatusTy::Unsafe;
1731 }
1732
1733 /// Freeze ToFreeze and update the use in \p UserI to use the frozen value.
1734 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1735 assert(isSafeWithFreeze() &&
1736 "should only be used when freezing is required");
1737 assert(is_contained(ToFreeze->users(), &UserI) &&
1738 "UserI must be a user of ToFreeze");
1739 IRBuilder<>::InsertPointGuard Guard(Builder);
1740 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1741 Value *Frozen =
1742 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1743 for (Use &U : make_early_inc_range((UserI.operands())))
1744 if (U.get() == ToFreeze)
1745 U.set(Frozen);
1746
1747 ToFreeze = nullptr;
1748 }
1749};
1750} // namespace
1751
1752/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1753/// Idx. \p Idx must access a valid vector element.
1754static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1755 Instruction *CtxI,
1756 AssumptionCache &AC,
1757 const DominatorTree &DT) {
1758 // We do checks for both fixed vector types and scalable vector types.
1759 // This is the number of elements of fixed vector types,
1760 // or the minimum number of elements of scalable vector types.
1761 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1762 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1763
1764 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1765 if (C->getValue().ult(NumElements))
1766 return ScalarizationResult::safe();
1767 return ScalarizationResult::unsafe();
1768 }
1769
1770 // Always unsafe if the index type can't handle all inbound values.
1771 if (!llvm::isUIntN(IntWidth, NumElements))
1772 return ScalarizationResult::unsafe();
1773
1774 APInt Zero(IntWidth, 0);
1775 APInt MaxElts(IntWidth, NumElements);
1776 ConstantRange ValidIndices(Zero, MaxElts);
1777 ConstantRange IdxRange(IntWidth, true);
1778
1779 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1780 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1781 true, &AC, CtxI, &DT)))
1782 return ScalarizationResult::safe();
1783 return ScalarizationResult::unsafe();
1784 }
1785
1786 // If the index may be poison, check if we can insert a freeze before the
1787 // range of the index is restricted.
1788 Value *IdxBase;
1789 ConstantInt *CI;
1790 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1791 IdxRange = IdxRange.binaryAnd(CI->getValue());
1792 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1793 IdxRange = IdxRange.urem(CI->getValue());
1794 }
1795
1796 if (ValidIndices.contains(IdxRange))
1797 return ScalarizationResult::safeWithFreeze(IdxBase);
1798 return ScalarizationResult::unsafe();
1799}
1800
1801/// The memory operation on a vector of \p ScalarType had alignment of
1802/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1803/// alignment that will be valid for the memory operation on a single scalar
1804/// element of the same type with index \p Idx.
1805static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1806 Type *ScalarType, Value *Idx,
1807 const DataLayout &DL) {
1808 if (auto *C = dyn_cast<ConstantInt>(Idx))
1809 return commonAlignment(VectorAlignment,
1810 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1811 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1812}
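// Worked example for computeAlignmentAfterScalarization (illustrative,
// assuming 4-byte i32 elements): a <4 x i32> access with align 16 scalarized
// at constant index 1 has byte offset 1 * 4, so commonAlignment(16, 4) = 4;
// at index 2 the offset is 8 and the result is 8; with a non-constant index
// only the element store size can be assumed, giving commonAlignment(16, 4) = 4.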
1813
1814// Combine patterns like:
1815// %0 = load <4 x i32>, <4 x i32>* %a
1816// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1817// store <4 x i32> %1, <4 x i32>* %a
1818// to:
1819// %0 = bitcast <4 x i32>* %a to i32*
1820// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1821// store i32 %b, i32* %1
1822bool VectorCombine::foldSingleElementStore(Instruction &I) {
1823 if (!TTI.allowVectorElementIndexingUsingGEP())
1824 return false;
1825 auto *SI = cast<StoreInst>(&I);
1826 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1827 return false;
1828
1829 // TODO: Combine more complicated patterns (multiple insert) by referencing
1830 // TargetTransformInfo.
1831 Instruction *Source;
1832 Value *NewElement;
1833 Value *Idx;
1834 if (!match(SI->getValueOperand(),
1835 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1836 m_Value(Idx))))
1837 return false;
1838
1839 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1840 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1841 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1842 // Don't optimize for atomic/volatile load or store. Ensure memory is not
1843 // modified between, vector type matches store size, and index is inbounds.
1844 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1845 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1846 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1847 return false;
1848
1849 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1850 if (ScalarizableIdx.isUnsafe() ||
1851 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1852 MemoryLocation::get(SI), AA))
1853 return false;
1854
1855 // Ensure we add the load back to the worklist BEFORE its users so they can
1856 // be erased in the correct order.
1857 Worklist.push(Load);
1858
1859 if (ScalarizableIdx.isSafeWithFreeze())
1860 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1861 Value *GEP = Builder.CreateInBoundsGEP(
1862 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1863 {ConstantInt::get(Idx->getType(), 0), Idx});
1864 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1865 NSI->copyMetadata(*SI);
1866 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1867 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1868 *DL);
1869 NSI->setAlignment(ScalarOpAlignment);
1870 replaceValue(I, *NSI);
1871 eraseInstruction(I);
1872 return true;
1873 }
1874
1875 return false;
1876}
1877
1878/// Try to scalarize vector loads feeding extractelement instructions.
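/// Illustrative IR sketch (hypothetical types, not from the tests):
///   %v = load <4 x i32>, ptr %p, align 16
///   %e = extractelement <4 x i32> %v, i64 2
/// becomes, when the cost model prefers the scalar form:
///   %g = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i32 2
///   %e = load i32, ptr %g, align 8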
1879bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
1880 if (!TTI.allowVectorElementIndexingUsingGEP())
1881 return false;
1882
1883 Value *Ptr;
1884 if (!match(&I, m_Load(m_Value(Ptr))))
1885 return false;
1886
1887 auto *LI = cast<LoadInst>(&I);
1888 auto *VecTy = cast<VectorType>(LI->getType());
1889 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
1890 return false;
1891
1892 InstructionCost OriginalCost =
1893 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
1894 LI->getPointerAddressSpace(), CostKind);
1895 InstructionCost ScalarizedCost = 0;
1896
1897 Instruction *LastCheckedInst = LI;
1898 unsigned NumInstChecked = 0;
1899 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
1900 auto FailureGuard = make_scope_exit([&]() {
1901 // If the transform is aborted, discard the ScalarizationResults.
1902 for (auto &Pair : NeedFreeze)
1903 Pair.second.discard();
1904 });
1905
1906 // Check if all users of the load are extracts with no memory modifications
1907 // between the load and the extract. Compute the cost of both the original
1908 // code and the scalarized version.
1909 for (User *U : LI->users()) {
1910 auto *UI = dyn_cast<ExtractElementInst>(U);
1911 if (!UI || UI->getParent() != LI->getParent())
1912 return false;
1913
1914 // If any extract is waiting to be erased, then bail out as this will
1915 // distort the cost calculation and possibly lead to infinite loops.
1916 if (UI->use_empty())
1917 return false;
1918
1919 // Check if any instruction between the load and the extract may modify
1920 // memory.
1921 if (LastCheckedInst->comesBefore(UI)) {
1922 for (Instruction &I :
1923 make_range(std::next(LI->getIterator()), UI->getIterator())) {
1924 // Bail out if we reached the check limit or the instruction may write
1925 // to memory.
1926 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
1927 return false;
1928 NumInstChecked++;
1929 }
1930 LastCheckedInst = UI;
1931 }
1932
1933 auto ScalarIdx =
1934 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
1935 if (ScalarIdx.isUnsafe())
1936 return false;
1937 if (ScalarIdx.isSafeWithFreeze()) {
1938 NeedFreeze.try_emplace(UI, ScalarIdx);
1939 ScalarIdx.discard();
1940 }
1941
1942 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
1943 OriginalCost +=
1944 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
1945 Index ? Index->getZExtValue() : -1);
1946 ScalarizedCost +=
1947 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
1948 Align(1), LI->getPointerAddressSpace(), CostKind);
1949 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
1950 nullptr, nullptr, CostKind);
1951 }
1952
1953 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << I
1954 << "\n LoadExtractCost: " << OriginalCost
1955 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
1956
1957 if (ScalarizedCost >= OriginalCost)
1958 return false;
1959
1960 // Ensure we add the load back to the worklist BEFORE its users so they can
1961 // be erased in the correct order.
1962 Worklist.push(LI);
1963
1964 Type *ElemType = VecTy->getElementType();
1965
1966 // Replace extracts with narrow scalar loads.
1967 for (User *U : LI->users()) {
1968 auto *EI = cast<ExtractElementInst>(U);
1969 Value *Idx = EI->getIndexOperand();
1970
1971 // Insert 'freeze' for poison indexes.
1972 auto It = NeedFreeze.find(EI);
1973 if (It != NeedFreeze.end())
1974 It->second.freeze(Builder, *cast<Instruction>(Idx));
1975
1976 Builder.SetInsertPoint(EI);
1977 Value *GEP =
1978 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
1979 auto *NewLoad = cast<LoadInst>(
1980 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
1981
1982 Align ScalarOpAlignment =
1983 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
1984 NewLoad->setAlignment(ScalarOpAlignment);
1985
1986 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
1987 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
1988 AAMDNodes OldAAMD = LI->getAAMetadata();
1989 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
1990 }
1991
1992 replaceValue(*EI, *NewLoad, false);
1993 }
1994
1995 FailureGuard.release();
1996 return true;
1997}
1998
1999bool VectorCombine::scalarizeExtExtract(Instruction &I) {
2000 if (!TTI.allowVectorElementIndexingUsingGEP())
2001 return false;
2002 auto *Ext = dyn_cast<ZExtInst>(&I);
2003 if (!Ext)
2004 return false;
2005
2006 // Try to convert a vector zext feeding only extracts to a set of scalar
2007 // (Src >> (ExtIdx * EltBitWidth)) & ((1 << EltBitWidth) - 1)
2008 // operations, if profitable.
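// Illustrative sketch (hypothetical types, not from the tests): for
//   %z = zext <4 x i8> %src to <4 x i32>
// whose lane 2 is the only extracted element, the scalarized form is roughly
//   %s  = bitcast <4 x i8> %src to i32
//   %sh = lshr i32 %s, 16
//   %e  = and i32 %sh, 255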
2009 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2010 if (!SrcTy)
2011 return false;
2012 auto *DstTy = cast<FixedVectorType>(Ext->getType());
2013
2014 Type *ScalarDstTy = DstTy->getElementType();
2015 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2016 return false;
2017
2018 InstructionCost VectorCost =
2019 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2020 TTI::CastContextHint::None, CostKind);
2021 unsigned ExtCnt = 0;
2022 bool ExtLane0 = false;
2023 for (User *U : Ext->users()) {
2024 uint64_t Idx;
2025 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2026 return false;
2027 if (cast<Instruction>(U)->use_empty())
2028 continue;
2029 ExtCnt += 1;
2030 ExtLane0 |= !Idx;
2031 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2032 CostKind, Idx, U);
2033 }
2034
2035 InstructionCost ScalarCost =
2036 ExtCnt * TTI.getArithmeticInstrCost(
2037 Instruction::And, ScalarDstTy, CostKind,
2038 {TTI::OK_AnyValue, TTI::OP_None},
2039 {TTI::OK_NonUniformConstantValue, TTI::OP_None}) +
2040 (ExtCnt - ExtLane0) *
2041 TTI.getArithmeticInstrCost(
2042 Instruction::LShr, ScalarDstTy, CostKind,
2043 {TTI::OK_AnyValue, TTI::OP_None},
2044 {TTI::OK_NonUniformConstantValue, TTI::OP_None});
2045 if (ScalarCost > VectorCost)
2046 return false;
2047
2048 Value *ScalarV = Ext->getOperand(0);
2049 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2050 &DT))
2051 ScalarV = Builder.CreateFreeze(ScalarV);
2052 ScalarV = Builder.CreateBitCast(
2053 ScalarV,
2054 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2055 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2056 uint64_t EltBitMask = (1ull << SrcEltSizeInBits) - 1;
2057 for (User *U : Ext->users()) {
2058 auto *Extract = cast<ExtractElementInst>(U);
2059 uint64_t Idx =
2060 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2061 Value *LShr = Builder.CreateLShr(ScalarV, Idx * SrcEltSizeInBits);
2062 Value *And = Builder.CreateAnd(LShr, EltBitMask);
2063 U->replaceAllUsesWith(And);
2064 }
2065 return true;
2066}
2067
2068/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2069/// to "(bitcast (concat X, Y))"
2070/// where X/Y are bitcasted from i1 mask vectors.
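/// Illustrative IR sketch (hypothetical widths):
///   %bx = bitcast <8 x i1> %x to i8
///   %by = bitcast <8 x i1> %y to i8
///   %zx = zext i8 %bx to i16
///   %zy = zext i8 %by to i16
///   %sy = shl i16 %zy, 8
///   %r  = or disjoint i16 %zx, %sy
/// becomes, when the cost model agrees:
///   %c  = shufflevector <8 x i1> %x, <8 x i1> %y, <16 x i32> <i32 0, ..., i32 15>
///   %r  = bitcast <16 x i1> %c to i16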
2071bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2072 Type *Ty = I.getType();
2073 if (!Ty->isIntegerTy())
2074 return false;
2075
2076 // TODO: Add big endian test coverage
2077 if (DL->isBigEndian())
2078 return false;
2079
2080 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2081 Instruction *X, *Y;
2082 if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y))))
2083 return false;
2084
2085 // Allow both sources to contain shl, to handle more generic pattern:
2086 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2087 Value *SrcX;
2088 uint64_t ShAmtX = 0;
2089 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2090 !match(X, m_OneUse(
2091 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
2092 m_ConstantInt(ShAmtX)))))
2093 return false;
2094
2095 Value *SrcY;
2096 uint64_t ShAmtY = 0;
2097 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2098 !match(Y, m_OneUse(
2099 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
2100 m_ConstantInt(ShAmtY)))))
2101 return false;
2102
2103 // Canonicalize larger shift to the RHS.
2104 if (ShAmtX > ShAmtY) {
2105 std::swap(X, Y);
2106 std::swap(SrcX, SrcY);
2107 std::swap(ShAmtX, ShAmtY);
2108 }
2109
2110 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2111 // difference is the mask width so they can be easily concatenated together.
2112 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2113 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2114 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2115 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2116 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2117 !MaskTy->getElementType()->isIntegerTy(1) ||
2118 MaskTy->getNumElements() != ShAmtDiff ||
2119 MaskTy->getNumElements() > (BitWidth / 2))
2120 return false;
2121
2122 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2123 auto *ConcatIntTy =
2124 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2125 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2126
2127 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2128 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2129
2130 // TODO: Is it worth supporting multi use cases?
2131 InstructionCost OldCost = 0;
2132 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2133 OldCost +=
2134 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2135 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2136 TTI::CastContextHint::None, CostKind);
2137 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2138 TTI::CastContextHint::None, CostKind);
2139
2140 InstructionCost NewCost = 0;
2141 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ConcatTy,
2142 MaskTy, ConcatMask, CostKind);
2143 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2144 TTI::CastContextHint::None, CostKind);
2145 if (Ty != ConcatIntTy)
2146 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2147 TTI::CastContextHint::None, CostKind);
2148 if (ShAmtX > 0)
2149 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2150
2151 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2152 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2153 << "\n");
2154
2155 if (NewCost > OldCost)
2156 return false;
2157
2158 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2159 // any residual zero-extension or shifting.
2160 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2161 Worklist.pushValue(Concat);
2162
2163 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2164
2165 if (Ty != ConcatIntTy) {
2166 Worklist.pushValue(Result);
2167 Result = Builder.CreateZExt(Result, Ty);
2168 }
2169
2170 if (ShAmtX > 0) {
2171 Worklist.pushValue(Result);
2172 Result = Builder.CreateShl(Result, ShAmtX);
2173 }
2174
2175 replaceValue(I, *Result);
2176 return true;
2177}
2178
2179/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2180/// --> "binop (shuffle), (shuffle)".
2181bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2182 BinaryOperator *BinOp;
2183 ArrayRef<int> OuterMask;
2184 if (!match(&I,
2185 m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask))))
2186 return false;
2187
2188 // Don't introduce poison into div/rem.
2189 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2190 return false;
2191
2192 Value *Op00, *Op01, *Op10, *Op11;
2193 ArrayRef<int> Mask0, Mask1;
2194 bool Match0 =
2195 match(BinOp->getOperand(0),
2196 m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
2197 bool Match1 =
2198 match(BinOp->getOperand(1),
2199 m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
2200 if (!Match0 && !Match1)
2201 return false;
2202
2203 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2204 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2205 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2206 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2207
2208 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2209 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2210 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2211 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2212 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2213 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2214 return false;
2215
2216 unsigned NumSrcElts = BinOpTy->getNumElements();
2217
2218 // Don't accept shuffles that reference the second operand in
2219 // div/rem or if it's an undef arg.
2220 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2221 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2222 return false;
2223
2224 // Merge outer / inner (or identity if no match) shuffles.
2225 SmallVector<int> NewMask0, NewMask1;
2226 for (int M : OuterMask) {
2227 if (M < 0 || M >= (int)NumSrcElts) {
2228 NewMask0.push_back(PoisonMaskElem);
2229 NewMask1.push_back(PoisonMaskElem);
2230 } else {
2231 NewMask0.push_back(Match0 ? Mask0[M] : M);
2232 NewMask1.push_back(Match1 ? Mask1[M] : M);
2233 }
2234 }
2235
2236 unsigned NumOpElts = Op0Ty->getNumElements();
2237 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2238 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2239 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2240 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2241 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2242 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2243
2244 // Try to merge shuffles across the binop if the new shuffles are not costly.
2245 InstructionCost OldCost =
2246 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
2247 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
2248 BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
2249 if (Match0)
2250 OldCost += TTI.getShuffleCost(
2251 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2252 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2253 if (Match1)
2254 OldCost += TTI.getShuffleCost(
2255 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2256 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2257
2258 InstructionCost NewCost =
2259 TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2260
2261 if (!IsIdentity0)
2262 NewCost +=
2263 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2264 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2265 if (!IsIdentity1)
2266 NewCost +=
2267 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2268 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2269
2270 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2271 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2272 << "\n");
2273
2274 // If costs are equal, still fold as we reduce instruction count.
2275 if (NewCost > OldCost)
2276 return false;
2277
2278 Value *LHS =
2279 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2280 Value *RHS =
2281 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2282 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2283
2284 // Intersect flags from the old binops.
2285 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2286 NewInst->copyIRFlags(BinOp);
2287
2288 Worklist.pushValue(LHS);
2289 Worklist.pushValue(RHS);
2290 replaceValue(I, *NewBO);
2291 return true;
2292}
2293
2294/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2295/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
2296bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2297 ArrayRef<int> OldMask;
2298 Instruction *LHS, *RHS;
2299 if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)),
2300 m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
2301 return false;
2302
2303 // TODO: Add support for addlike etc.
2304 if (LHS->getOpcode() != RHS->getOpcode())
2305 return false;
2306
2307 Value *X, *Y, *Z, *W;
2308 bool IsCommutative = false;
2309 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2310 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2311 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2312 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2313 auto *BO = cast<BinaryOperator>(LHS);
2314 // Don't introduce poison into div/rem.
2315 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2316 return false;
2317 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2318 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2319 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2320 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2321 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2322 } else
2323 return false;
2324
2325 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2326 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2327 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2328 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2329 return false;
2330
2331 unsigned NumSrcElts = BinOpTy->getNumElements();
2332
2333 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2334 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2335 std::swap(X, Y);
2336
2337 auto ConvertToUnary = [NumSrcElts](int &M) {
2338 if (M >= (int)NumSrcElts)
2339 M -= NumSrcElts;
2340 };
2341
2342 SmallVector<int> NewMask0(OldMask);
2343 TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
2344 if (X == Z) {
2345 llvm::for_each(NewMask0, ConvertToUnary);
2346 SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
2347 Z = PoisonValue::get(BinOpTy);
2348 }
2349
2350 SmallVector<int> NewMask1(OldMask);
2351 TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
2352 if (Y == W) {
2353 llvm::for_each(NewMask1, ConvertToUnary);
2354 SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
2355 W = PoisonValue::get(BinOpTy);
2356 }
2357
2358 // Try to replace a binop with a shuffle if the shuffle is not costly.
2359 InstructionCost OldCost =
2360 TTI.getInstructionCost(LHS, CostKind) +
2361 TTI.getInstructionCost(RHS, CostKind) +
2362 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2363 BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
2364 &I);
2365
2366 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2367 // where one use shuffles have gotten split across the binop/cmp. These
2368 // often allow a major reduction in total cost that wouldn't happen as
2369 // individual folds.
2370 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2371 TTI::TargetCostKind CostKind) -> bool {
2372 Value *InnerOp;
2373 ArrayRef<int> InnerMask;
2374 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2375 m_Mask(InnerMask)))) &&
2376 InnerOp->getType() == Op->getType() &&
2377 all_of(InnerMask,
2378 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2379 for (int &M : Mask)
2380 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2381 M = InnerMask[M - Offset];
2382 M = 0 <= M ? M + Offset : M;
2383 }
2384 OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind);
2385 Op = InnerOp;
2386 return true;
2387 }
2388 return false;
2389 };
2390 bool ReducedInstCount = false;
2391 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2392 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2393 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2394 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2395
2396 auto *ShuffleCmpTy =
2397 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2398 InstructionCost NewCost =
2399 TTI.getShuffleCost(SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0,
2400 nullptr, {X, Z}) +
2401 TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1, CostKind, 0,
2402 nullptr, {Y, W});
2403
2404 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2405 NewCost +=
2406 TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
2407 } else {
2408 NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
2409 ShuffleDstTy, PredLHS, CostKind);
2410 }
2411
2412 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2413 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2414 << "\n");
2415
2416 // If either shuffle will constant fold away, then fold for the same cost as
2417 // we will reduce the instruction count.
2418 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2419 (isa<Constant>(Y) && isa<Constant>(W));
2420 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2421 return false;
2422
2423 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2424 Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
2425 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2426 ? Builder.CreateBinOp(
2427 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2428 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2429
2430 // Intersect flags from the old binops.
2431 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2432 NewInst->copyIRFlags(LHS);
2433 NewInst->andIRFlags(RHS);
2434 }
2435
2436 Worklist.pushValue(Shuf0);
2437 Worklist.pushValue(Shuf1);
2438 replaceValue(I, *NewBO);
2439 return true;
2440}
2441
2442/// Try to convert,
2443/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2444/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
2445bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2446 ArrayRef<int> Mask;
2447 Value *C1, *T1, *F1, *C2, *T2, *F2;
2448 if (!match(&I, m_Shuffle(
2449 m_OneUse(m_Select(m_Value(C1), m_Value(T1), m_Value(F1))),
2450 m_OneUse(m_Select(m_Value(C2), m_Value(T2), m_Value(F2))),
2451 m_Mask(Mask))))
2452 return false;
2453
2454 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2455 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2456 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2457 return false;
2458
2459 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2460 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2461 // SelectInsts must have the same FMF.
2462 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2463 ((SI0FOp != nullptr) &&
2464 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2465 return false;
2466
2467 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2468 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2469 auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
2470 auto SelOp = Instruction::Select;
2471 InstructionCost OldCost = TTI.getCmpSelInstrCost(
2472 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2473 OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
2474 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2475 OldCost +=
2476 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2477 {I.getOperand(0), I.getOperand(1)}, &I);
2478
2479 InstructionCost NewCost = TTI.getShuffleCost(
2480 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2481 Mask, CostKind, 0, nullptr, {C1, C2});
2482 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2483 nullptr, {T1, T2});
2484 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2485 nullptr, {F1, F2});
2486 auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2487 toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2488 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2489 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2490
2491 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2492 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2493 << "\n");
2494 if (NewCost > OldCost)
2495 return false;
2496
2497 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2498 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2499 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2500 Value *NewSel;
2501 // We presuppose that the SelectInsts have the same FMF.
2502 if (SI0FOp)
2503 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2504 SI0FOp->getFastMathFlags());
2505 else
2506 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2507
2508 Worklist.pushValue(ShuffleCmp);
2509 Worklist.pushValue(ShuffleTrue);
2510 Worklist.pushValue(ShuffleFalse);
2511 replaceValue(I, *NewSel);
2512 return true;
2513}
2514
2515/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2516/// into "castop (shuffle)".
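/// Illustrative IR sketch (hypothetical types):
///   %z0 = zext <4 x i16> %a to <4 x i32>
///   %z1 = zext <4 x i16> %b to <4 x i32>
///   %r  = shufflevector <4 x i32> %z0, <4 x i32> %z1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
/// becomes, when the cost model agrees:
///   %s  = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
///   %r  = zext <8 x i16> %s to <8 x i32>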
2517bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2518 Value *V0, *V1;
2519 ArrayRef<int> OldMask;
2520 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2521 return false;
2522
2523 auto *C0 = dyn_cast<CastInst>(V0);
2524 auto *C1 = dyn_cast<CastInst>(V1);
2525 if (!C0 || !C1)
2526 return false;
2527
2528 Instruction::CastOps Opcode = C0->getOpcode();
2529 if (C0->getSrcTy() != C1->getSrcTy())
2530 return false;
2531
2532 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2533 if (Opcode != C1->getOpcode()) {
2534 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2535 Opcode = Instruction::SExt;
2536 else
2537 return false;
2538 }
2539
2540 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2541 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2542 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2543 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2544 return false;
2545
2546 unsigned NumSrcElts = CastSrcTy->getNumElements();
2547 unsigned NumDstElts = CastDstTy->getNumElements();
2548 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2549 "Only bitcasts expected to alter src/dst element counts");
2550
2551 // Check for bitcasting of unscalable vector types.
2552 // e.g. <32 x i40> -> <40 x i32>
2553 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2554 (NumDstElts % NumSrcElts) != 0)
2555 return false;
2556
2557 SmallVector<int, 16> NewMask;
2558 if (NumSrcElts >= NumDstElts) {
2559 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2560 // always be expanded to the equivalent form choosing narrower elements.
2561 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2562 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2563 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2564 } else {
2565 // The bitcast is from narrow elements to wide elements. The shuffle mask
2566 // must choose consecutive elements to allow casting first.
2567 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2568 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2569 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2570 return false;
2571 }
2572
2573 auto *NewShuffleDstTy =
2574 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2575
2576 // Try to replace a castop with a shuffle if the shuffle is not costly.
2577 InstructionCost CostC0 =
2578 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2579 TTI::CastContextHint::None, CostKind);
2580 InstructionCost CostC1 =
2581 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2582 TTI::CastContextHint::None, CostKind);
2583 InstructionCost OldCost = CostC0 + CostC1;
2584 OldCost +=
2585 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2586 CastDstTy, OldMask, CostKind, 0, nullptr, {}, &I);
2587
2588 InstructionCost NewCost =
2589 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, NewShuffleDstTy,
2590 CastSrcTy, NewMask, CostKind);
2591 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
2592 TTI::CastContextHint::None, CostKind);
2593 if (!C0->hasOneUse())
2594 NewCost += CostC0;
2595 if (!C1->hasOneUse())
2596 NewCost += CostC1;
2597
2598 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2599 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2600 << "\n");
2601 if (NewCost > OldCost)
2602 return false;
2603
2604 Value *Shuf = Builder.CreateShuffleVector(C0->getOperand(0),
2605 C1->getOperand(0), NewMask);
2606 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2607
2608 // Intersect flags from the old casts.
2609 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2610 NewInst->copyIRFlags(C0);
2611 NewInst->andIRFlags(C1);
2612 }
2613
2614 Worklist.pushValue(Shuf);
2615 replaceValue(I, *Cast);
2616 return true;
2617}
2618
2619/// Try to convert any of:
2620/// "shuffle (shuffle x, y), (shuffle y, x)"
2621/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2622/// "shuffle (shuffle x, undef), y"
2623/// "shuffle x, (shuffle y, undef)"
2624/// into "shuffle x, y".
2625bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2626 ArrayRef<int> OuterMask;
2627 Value *OuterV0, *OuterV1;
2628 if (!match(&I,
2629 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2630 return false;
2631
2632 ArrayRef<int> InnerMask0, InnerMask1;
2633 Value *X0, *X1, *Y0, *Y1;
2634 bool Match0 =
2635 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2636 bool Match1 =
2637 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2638 if (!Match0 && !Match1)
2639 return false;
2640
2641 // If the outer shuffle is a permute, then create a fake inner all-poison
2642 // shuffle. This is easier than accounting for length-changing shuffles below.
2643 SmallVector<int, 16> PoisonMask1;
2644 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2645 X1 = X0;
2646 Y1 = Y0;
2647 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2648 InnerMask1 = PoisonMask1;
2649 Match1 = true; // fake match
2650 }
2651
2652 X0 = Match0 ? X0 : OuterV0;
2653 Y0 = Match0 ? Y0 : OuterV0;
2654 X1 = Match1 ? X1 : OuterV1;
2655 Y1 = Match1 ? Y1 : OuterV1;
2656 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2657 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2658 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2659 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2660 X0->getType() != X1->getType())
2661 return false;
2662
2663 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2664 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2665
2666 // Attempt to merge shuffles, matching up to 2 source operands.
2667 // Replace index to a poison arg with PoisonMaskElem.
2668 // Bail if either inner masks reference an undef arg.
2669 SmallVector<int, 16> NewMask(OuterMask);
2670 Value *NewX = nullptr, *NewY = nullptr;
2671 for (int &M : NewMask) {
2672 Value *Src = nullptr;
2673 if (0 <= M && M < (int)NumImmElts) {
2674 Src = OuterV0;
2675 if (Match0) {
2676 M = InnerMask0[M];
2677 Src = M >= (int)NumSrcElts ? Y0 : X0;
2678 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2679 }
2680 } else if (M >= (int)NumImmElts) {
2681 Src = OuterV1;
2682 M -= NumImmElts;
2683 if (Match1) {
2684 M = InnerMask1[M];
2685 Src = M >= (int)NumSrcElts ? Y1 : X1;
2686 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2687 }
2688 }
2689 if (Src && M != PoisonMaskElem) {
2690 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2691 if (isa<UndefValue>(Src)) {
2692 // We've referenced an undef element - if it's poison, update the shuffle
2693 // mask, else bail.
2694 if (!isa<PoisonValue>(Src))
2695 return false;
2696 M = PoisonMaskElem;
2697 continue;
2698 }
2699 if (!NewX || NewX == Src) {
2700 NewX = Src;
2701 continue;
2702 }
2703 if (!NewY || NewY == Src) {
2704 M += NumSrcElts;
2705 NewY = Src;
2706 continue;
2707 }
2708 return false;
2709 }
2710 }
2711
2712 if (!NewX)
2713 return PoisonValue::get(ShuffleDstTy);
2714 if (!NewY)
2715 NewY = PoisonValue::get(ShuffleSrcTy);
2716
2717 // Have we folded to an Identity shuffle?
2718 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
2719 replaceValue(I, *NewX);
2720 return true;
2721 }
2722
2723 // Try to merge the shuffles if the new shuffle is not costly.
2724 InstructionCost InnerCost0 = 0;
2725 if (Match0)
2726 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
2727
2728 InstructionCost InnerCost1 = 0;
2729 if (Match1)
2730 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
2731
2732 InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind);
2733
2734 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
2735
2736 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
2737 TargetTransformInfo::ShuffleKind SK =
2738 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
2739 : TargetTransformInfo::SK_PermuteTwoSrc;
2740 InstructionCost NewCost =
2741 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
2742 nullptr, {NewX, NewY});
2743 if (!OuterV0->hasOneUse())
2744 NewCost += InnerCost0;
2745 if (!OuterV1->hasOneUse())
2746 NewCost += InnerCost1;
2747
2748 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
2749 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2750 << "\n");
2751 if (NewCost > OldCost)
2752 return false;
2753
2754 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
2755 replaceValue(I, *Shuf);
2756 return true;
2757}
2758
2759/// Try to convert
2760/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
2761bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
2762 Value *V0, *V1;
2763 ArrayRef<int> OldMask;
2764 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
2765 m_Mask(OldMask))))
2766 return false;
2767
2768 auto *II0 = dyn_cast<IntrinsicInst>(V0);
2769 auto *II1 = dyn_cast<IntrinsicInst>(V1);
2770 if (!II0 || !II1)
2771 return false;
2772
2773 Intrinsic::ID IID = II0->getIntrinsicID();
2774 if (IID != II1->getIntrinsicID())
2775 return false;
2776
2777 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2778 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
2779 if (!ShuffleDstTy || !II0Ty)
2780 return false;
2781
2782 if (!isTriviallyVectorizable(IID))
2783 return false;
2784
2785 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2786 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) &&
2787 II0->getArgOperand(I) != II1->getArgOperand(I))
2788 return false;
2789
2790 InstructionCost OldCost =
2791 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
2792 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
2793 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2794 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
2795
2796 SmallVector<Type *> NewArgsTy;
2797 InstructionCost NewCost = 0;
2798 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
2799 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
2800 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
2801 } else {
2802 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
2803 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
2804 ShuffleDstTy->getNumElements());
2805 NewArgsTy.push_back(ArgTy);
2806 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
2807 ArgTy, VecTy, OldMask, CostKind);
2808 }
2809 }
2810 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
2811 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
2812
2813 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
2814 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2815 << "\n");
2816
2817 if (NewCost > OldCost)
2818 return false;
2819
2820 SmallVector<Value *> NewArgs;
2821 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2822 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
2823 NewArgs.push_back(II0->getArgOperand(I));
2824 } else {
2825 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
2826 II1->getArgOperand(I), OldMask);
2827 NewArgs.push_back(Shuf);
2828 Worklist.pushValue(Shuf);
2829 }
2830 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
2831
2832 // Intersect flags from the old intrinsics.
2833 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
2834 NewInst->copyIRFlags(II0);
2835 NewInst->andIRFlags(II1);
2836 }
2837
2838 replaceValue(I, *NewIntrinsic);
2839 return true;
2840}
2841
2842using InstLane = std::pair<Use *, int>;
2843
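// An InstLane pairs a use of a vector value with the lane of that value being
// tracked. lookThroughShuffles (below) walks up through shufflevector
// operands, remapping the lane at each step; e.g. (illustrative) lane 2 of
//   %s = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
// resolves to lane 2 of %b.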
2844static InstLane lookThroughShuffles(Use *U, int Lane) {
2845 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
2846 unsigned NumElts =
2847 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
2848 int M = SV->getMaskValue(Lane);
2849 if (M < 0)
2850 return {nullptr, PoisonMaskElem};
2851 if (static_cast<unsigned>(M) < NumElts) {
2852 U = &SV->getOperandUse(0);
2853 Lane = M;
2854 } else {
2855 U = &SV->getOperandUse(1);
2856 Lane = M - NumElts;
2857 }
2858 }
2859 return InstLane{U, Lane};
2860}
2861
2861
2862static SmallVector<InstLane>
2863generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
2864 SmallVector<InstLane> NItem;
2865 for (InstLane IL : Item) {
2866 auto [U, Lane] = IL;
2867 InstLane OpLane =
2868 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
2869 Lane)
2870 : InstLane{nullptr, PoisonMaskElem};
2871 NItem.emplace_back(OpLane);
2872 }
2873 return NItem;
2874}
2875
2876/// Detect concat of multiple values into a vector
2877static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
2878 const TargetTransformInfo &TTI) {
2879 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
2880 unsigned NumElts = Ty->getNumElements();
2881 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
2882 return false;
2883
2884 // Check that the concat is free, usually meaning that the type will be split
2885 // during legalization.
2886 SmallVector<int, 16> ConcatMask(NumElts * 2);
2887 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2888 if (TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
2889 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
2890 Ty, ConcatMask, CostKind) != 0)
2891 return false;
2892
2893 unsigned NumSlices = Item.size() / NumElts;
2894 // Currently we generate a tree of shuffles for the concats, which limits us
2895 // to a power2.
2896 if (!isPowerOf2_32(NumSlices))
2897 return false;
2898 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
2899 Use *SliceV = Item[Slice * NumElts].first;
2900 if (!SliceV || SliceV->get()->getType() != Ty)
2901 return false;
2902 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
2903 auto [V, Lane] = Item[Slice * NumElts + Elt];
2904 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
2905 return false;
2906 }
2907 }
2908 return true;
2909}
2910
2911static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
2912 const SmallPtrSet<Use *, 4> &IdentityLeafs,
2913 const SmallPtrSet<Use *, 4> &SplatLeafs,
2914 const SmallPtrSet<Use *, 4> &ConcatLeafs,
2915 IRBuilderBase &Builder,
2916 const TargetTransformInfo *TTI) {
2917 auto [FrontU, FrontLane] = Item.front();
2918
2919 if (IdentityLeafs.contains(FrontU)) {
2920 return FrontU->get();
2921 }
2922 if (SplatLeafs.contains(FrontU)) {
2923 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
2924 return Builder.CreateShuffleVector(FrontU->get(), Mask);
2925 }
2926 if (ConcatLeafs.contains(FrontU)) {
2927 unsigned NumElts =
2928 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
2929 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
2930 for (unsigned S = 0; S < Values.size(); ++S)
2931 Values[S] = Item[S * NumElts].first->get();
2932
2933 while (Values.size() > 1) {
2934 NumElts *= 2;
2935 SmallVector<int, 16> Mask(NumElts, 0);
2936 std::iota(Mask.begin(), Mask.end(), 0);
2937 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
2938 for (unsigned S = 0; S < NewValues.size(); ++S)
2939 NewValues[S] =
2940 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
2941 Values = NewValues;
2942 }
2943 return Values[0];
2944 }
2945
2946 auto *I = cast<Instruction>(FrontU->get());
2947 auto *II = dyn_cast<IntrinsicInst>(I);
2948 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
2949 SmallVector<Value *> Ops(NumOps);
2950 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
2951 if (II &&
2952 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
2953 Ops[Idx] = II->getOperand(Idx);
2954 continue;
2955 }
2956 Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
2957 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
2958 Builder, TTI);
2959 }
2960
2961 SmallVector<Value *, 8> ValueList;
2962 for (const auto &Lane : Item)
2963 if (Lane.first)
2964 ValueList.push_back(Lane.first->get());
2965
2966 Type *DstTy =
2967 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
2968 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
2969 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
2970 Ops[0], Ops[1]);
2971 propagateIRFlags(Value, ValueList);
2972 return Value;
2973 }
2974 if (auto *CI = dyn_cast<CmpInst>(I)) {
2975 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
2976 propagateIRFlags(Value, ValueList);
2977 return Value;
2978 }
2979 if (auto *SI = dyn_cast<SelectInst>(I)) {
2980 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
2981 propagateIRFlags(Value, ValueList);
2982 return Value;
2983 }
2984 if (auto *CI = dyn_cast<CastInst>(I)) {
2985 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
2986 propagateIRFlags(Value, ValueList);
2987 return Value;
2988 }
2989 if (II) {
2990 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
2991 propagateIRFlags(Value, ValueList);
2992 return Value;
2993 }
2994 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
2995 auto *Value =
2996 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
2997 propagateIRFlags(Value, ValueList);
2998 return Value;
2999}
3000
3001// Starting from a shuffle, look up through operands tracking the shuffled index
3002// of each lane. If we can simplify away the shuffles to identities then
3003// do so.
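// Illustrative IR sketch (hypothetical split/reconcat, not from the tests):
//   %lo = shufflevector <4 x i32> %x, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
//   %hi = shufflevector <4 x i32> %x, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
//   %a0 = add <2 x i32> %lo, <i32 1, i32 1>
//   %a1 = add <2 x i32> %hi, <i32 1, i32 1>
//   %r  = shufflevector <2 x i32> %a0, <2 x i32> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// Every lane of %r traces back to the matching lane of %x, so this can fold to
//   %r  = add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>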
3004bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3005 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3006 if (!Ty || I.use_empty())
3007 return false;
3008
3009 SmallVector<InstLane> Start(Ty->getNumElements());
3010 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3011 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
3012
3013 SmallVector<SmallVector<InstLane>> Worklist;
3014 Worklist.push_back(Start);
3015 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
3016 unsigned NumVisited = 0;
3017
3018 while (!Worklist.empty()) {
3019 if (++NumVisited > MaxInstrsToScan)
3020 return false;
3021
3022 SmallVector<InstLane> Item = Worklist.pop_back_val();
3023 auto [FrontU, FrontLane] = Item.front();
3024
3025 // If we found an undef first lane then bail out to keep things simple.
3026 if (!FrontU)
3027 return false;
3028
3029 // Helper to peek through bitcasts to the same value.
3030 auto IsEquiv = [&](Value *X, Value *Y) {
3031 return X->getType() == Y->getType() &&
3033 };
3034
3035 // Look for an identity value.
3036 if (FrontLane == 0 &&
3037 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
3038 Ty->getNumElements() &&
3039 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3040 Value *FrontV = Item.front().first->get();
3041 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
3042 E.value().second == (int)E.index());
3043 })) {
3044 IdentityLeafs.insert(FrontU);
3045 continue;
3046 }
3047 // Look for constants, for the moment only supporting constant splats.
3048 if (auto *C = dyn_cast<Constant>(FrontU);
3049 C && C->getSplatValue() &&
3050 all_of(drop_begin(Item), [Item](InstLane &IL) {
3051 Value *FrontV = Item.front().first->get();
3052 Use *U = IL.first;
3053 return !U || (isa<Constant>(U->get()) &&
3054 cast<Constant>(U->get())->getSplatValue() ==
3055 cast<Constant>(FrontV)->getSplatValue());
3056 })) {
3057 SplatLeafs.insert(FrontU);
3058 continue;
3059 }
3060 // Look for a splat value.
3061 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3062 auto [FrontU, FrontLane] = Item.front();
3063 auto [U, Lane] = IL;
3064 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3065 })) {
3066 SplatLeafs.insert(FrontU);
3067 continue;
3068 }
3069
3070 // We need each element to be the same type of value, and check that each
3071 // element has a single use.
3072 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3073 Value *FrontV = Item.front().first->get();
3074 if (!IL.first)
3075 return true;
3076 Value *V = IL.first->get();
3077 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3078 return false;
3079 if (V->getValueID() != FrontV->getValueID())
3080 return false;
3081 if (auto *CI = dyn_cast<CmpInst>(V))
3082 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3083 return false;
3084 if (auto *CI = dyn_cast<CastInst>(V))
3085 if (CI->getSrcTy()->getScalarType() !=
3086 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3087 return false;
3088 if (auto *SI = dyn_cast<SelectInst>(V))
3089 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3090 SI->getOperand(0)->getType() !=
3091 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3092 return false;
3093 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3094 return false;
3095 auto *II = dyn_cast<IntrinsicInst>(V);
3096 return !II || (isa<IntrinsicInst>(FrontV) &&
3097 II->getIntrinsicID() ==
3098 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3099 !II->hasOperandBundles());
3100 };
3101 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3102 // Check the operator is one that we support.
3103 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3104 // We exclude div/rem in case they hit UB from poison lanes.
3105 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3106 BO && BO->isIntDivRem())
3107 return false;
3108 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3109 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3110 continue;
3111 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3112 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3113 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3114 continue;
3115 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3116 // TODO: Handle vector widening/narrowing bitcasts.
3117 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3118 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3119 if (DstTy && SrcTy &&
3120 SrcTy->getNumElements() == DstTy->getNumElements()) {
3121 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3122 continue;
3123 }
3124 } else if (isa<SelectInst>(FrontU)) {
3125 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3126 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3127 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
3128 continue;
3129 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3130 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3131 !II->hasOperandBundles()) {
3132 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3133 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3134 &TTI)) {
3135 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3136 Value *FrontV = Item.front().first->get();
3137 Use *U = IL.first;
3138 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3139 cast<Instruction>(FrontV)->getOperand(Op));
3140 }))
3141 return false;
3142 continue;
3143 }
3144 Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
3145 }
3146 continue;
3147 }
3148 }
3149
3150 if (isFreeConcat(Item, CostKind, TTI)) {
3151 ConcatLeafs.insert(FrontU);
3152 continue;
3153 }
3154
3155 return false;
3156 }
3157
3158 if (NumVisited <= 1)
3159 return false;
3160
3161 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3162
3163 // If we got this far, we know the shuffles are superfluous and can be
3164 // removed. Scan through again and generate the new tree of instructions.
3165 Builder.SetInsertPoint(&I);
3166 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3167 ConcatLeafs, Builder, &TTI);
3168 replaceValue(I, *V);
3169 return true;
3170}
3171
3172/// Given a commutative reduction, the order of the input lanes does not alter
3173/// the results. We can use this to remove certain shuffles feeding the
3174/// reduction, removing the need to shuffle at all.
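/// For illustration (a hypothetical example, not taken from this file): in
///   %s = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
/// the reduction is insensitive to lane order, so the reversing mask can be
/// replaced with its sorted form <0, 1, 2, 3> (an identity here), which later
/// folds can then remove entirely.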
3175bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3176 auto *II = dyn_cast<IntrinsicInst>(&I);
3177 if (!II)
3178 return false;
3179 switch (II->getIntrinsicID()) {
3180 case Intrinsic::vector_reduce_add:
3181 case Intrinsic::vector_reduce_mul:
3182 case Intrinsic::vector_reduce_and:
3183 case Intrinsic::vector_reduce_or:
3184 case Intrinsic::vector_reduce_xor:
3185 case Intrinsic::vector_reduce_smin:
3186 case Intrinsic::vector_reduce_smax:
3187 case Intrinsic::vector_reduce_umin:
3188 case Intrinsic::vector_reduce_umax:
3189 break;
3190 default:
3191 return false;
3192 }
3193
3194 // Find all the inputs when looking through operations that do not alter the
3195 // lane order (binops, for example). Currently we look for a single shuffle,
3196 // and can ignore splat values.
3197 std::queue<Value *> Worklist;
3198 SmallPtrSet<Value *, 4> Visited;
3199 ShuffleVectorInst *Shuffle = nullptr;
3200 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3201 Worklist.push(Op);
3202
3203 while (!Worklist.empty()) {
3204 Value *CV = Worklist.front();
3205 Worklist.pop();
3206 if (Visited.contains(CV))
3207 continue;
3208
3209 // Splats don't change the order, so can be safely ignored.
3210 if (isSplatValue(CV))
3211 continue;
3212
3213 Visited.insert(CV);
3214
3215 if (auto *CI = dyn_cast<Instruction>(CV)) {
3216 if (CI->isBinaryOp()) {
3217 for (auto *Op : CI->operand_values())
3218 Worklist.push(Op);
3219 continue;
3220 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3221 if (Shuffle && Shuffle != SV)
3222 return false;
3223 Shuffle = SV;
3224 continue;
3225 }
3226 }
3227
3228 // Anything else is currently an unknown node.
3229 return false;
3230 }
3231
3232 if (!Shuffle)
3233 return false;
3234
3235 // Check all uses of the binary ops and shuffles are also included in the
3236 // lane-invariant operations (Visited should be the list of lanewise
3237 // instructions, including the shuffle that we found).
3238 for (auto *V : Visited)
3239 for (auto *U : V->users())
3240 if (!Visited.contains(U) && U != &I)
3241 return false;
3242
3243 FixedVectorType *VecType =
3244 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3245 if (!VecType)
3246 return false;
3247 FixedVectorType *ShuffleInputType =
3248 dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
3249 if (!ShuffleInputType)
3250 return false;
3251 unsigned NumInputElts = ShuffleInputType->getNumElements();
3252
3253 // Find the mask from sorting the lanes into order. This is most likely to
3254 // become an identity or concat mask. Undef elements are pushed to the end.
3255 SmallVector<int> ConcatMask;
3256 Shuffle->getShuffleMask(ConcatMask);
3257 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3258 bool UsesSecondVec =
3259 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3260
3261 InstructionCost OldCost = TTI.getShuffleCost(
3262 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3263 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3264 InstructionCost NewCost = TTI.getShuffleCost(
3265 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3266 ShuffleInputType, ConcatMask, CostKind);
3267
3268 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3269 << "\n");
3270 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3271 << "\n");
3272 bool MadeChanges = false;
3273 if (NewCost < OldCost) {
3274 Builder.SetInsertPoint(Shuffle);
3275 Value *NewShuffle = Builder.CreateShuffleVector(
3276 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3277 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3278 replaceValue(*Shuffle, *NewShuffle);
3279 return true;
3280 }
3281
3282 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3283 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3284 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3285 return MadeChanges;
3286}
3287
3288/// For a given chain of patterns of the following form:
3289///
3290/// ```
3291/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3292///
3293/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3294/// ty1> %1)
3295/// OR
3296/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3297///
3298/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3299/// ...
3300/// ...
3301/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3302/// 3), <n x ty1> %(i - 2)
3303/// OR
3304/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3305///
3306/// %(i) = extractelement <n x ty1> %(i - 1), 0
3307/// ```
3308///
3309/// Where:
3310/// `mask` follows a partition pattern:
3311///
3312/// Ex:
3313/// [n = 8, p = poison]
3314///
3315/// 4 5 6 7 | p p p p
3316/// 2 3 | p p p p p p
3317/// 1 | p p p p p p p
3318///
3319/// For powers of 2, there's a consistent pattern, but for other cases
3320/// the parity of the current half value at each step decides the
3321/// next partition half (see `ExpectedParityMask` for more logical details
3322/// in generalising this).
3323///
3324/// Ex:
3325/// [n = 6]
3326///
3327/// 3 4 5 | p p p
3328/// 1 2 | p p p p
3329/// 1 | p p p p p
3330bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3331 // Going bottom-up for the pattern.
3332 std::queue<Value *> InstWorklist;
3333 InstructionCost OrigCost = 0;
3334
3335 // Common instruction operation after each shuffle op.
3336 std::optional<unsigned int> CommonCallOp = std::nullopt;
3337 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3338
3339 bool IsFirstCallOrBinInst = true;
3340 bool ShouldBeCallOrBinInst = true;
3341
3342 // This stores the last used instructions for shuffle/common op.
3343 //
3344 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3345 // instructions from either shuffle/common op.
3346 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3347
3348 Value *VecOpEE;
3349 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3350 return false;
3351
3352 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3353 if (!FVT)
3354 return false;
3355
3356 int64_t VecSize = FVT->getNumElements();
3357 if (VecSize < 2)
3358 return false;
3359
3360 // Number of levels would be ~log2(n), considering we always partition
3361 // by half for this fold pattern.
3362 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3363 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3364
3365 // This is how we generalise for all element sizes.
3366 // At each step, if vector size is odd, we need non-poison
3367 // values to cover the dominant half so we don't miss out on any element.
3368 //
3369 // This mask will help us retrieve this as we go from bottom to top:
3370 //
3371 // Mask Set -> N = N * 2 - 1
3372 // Mask Unset -> N = N * 2
3373 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3374 Cur = (Cur + 1) / 2, --Mask) {
3375 if (Cur & 1)
3376 ExpectedParityMask |= (1ll << Mask);
3377 }
3378
3379 InstWorklist.push(VecOpEE);
3380
3381 while (!InstWorklist.empty()) {
3382 Value *CI = InstWorklist.front();
3383 InstWorklist.pop();
3384
3385 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3386 if (!ShouldBeCallOrBinInst)
3387 return false;
3388
3389 if (!IsFirstCallOrBinInst &&
3390 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3391 return false;
3392
3393 // For the first found call/bin op, the vector has to come from the
3394 // extract element op.
3395 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3396 return false;
3397 IsFirstCallOrBinInst = false;
3398
3399 if (!CommonCallOp)
3400 CommonCallOp = II->getIntrinsicID();
3401 if (II->getIntrinsicID() != *CommonCallOp)
3402 return false;
3403
3404 switch (II->getIntrinsicID()) {
3405 case Intrinsic::umin:
3406 case Intrinsic::umax:
3407 case Intrinsic::smin:
3408 case Intrinsic::smax: {
3409 auto *Op0 = II->getOperand(0);
3410 auto *Op1 = II->getOperand(1);
3411 PrevVecV[0] = Op0;
3412 PrevVecV[1] = Op1;
3413 break;
3414 }
3415 default:
3416 return false;
3417 }
3418 ShouldBeCallOrBinInst ^= 1;
3419
3420 IntrinsicCostAttributes ICA(
3421 *CommonCallOp, II->getType(),
3422 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
3423 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
3424
3425 // We may need a swap here since it can be (a, b) or (b, a)
3426 // and accordingly change as we go up.
3427 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3428 std::swap(PrevVecV[0], PrevVecV[1]);
3429 InstWorklist.push(PrevVecV[1]);
3430 InstWorklist.push(PrevVecV[0]);
3431 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
3432 // Similar logic for bin ops.
3433
3434 if (!ShouldBeCallOrBinInst)
3435 return false;
3436
3437 if (!IsFirstCallOrBinInst &&
3438 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3439 return false;
3440
3441 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3442 return false;
3443 IsFirstCallOrBinInst = false;
3444
3445 if (!CommonBinOp)
3446 CommonBinOp = BinOp->getOpcode();
3447
3448 if (BinOp->getOpcode() != *CommonBinOp)
3449 return false;
3450
3451 switch (*CommonBinOp) {
3452 case BinaryOperator::Add:
3453 case BinaryOperator::Mul:
3454 case BinaryOperator::Or:
3455 case BinaryOperator::And:
3456 case BinaryOperator::Xor: {
3457 auto *Op0 = BinOp->getOperand(0);
3458 auto *Op1 = BinOp->getOperand(1);
3459 PrevVecV[0] = Op0;
3460 PrevVecV[1] = Op1;
3461 break;
3462 }
3463 default:
3464 return false;
3465 }
3466 ShouldBeCallOrBinInst ^= 1;
3467
3468 OrigCost +=
3469 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
3470
3471 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3472 std::swap(PrevVecV[0], PrevVecV[1]);
3473 InstWorklist.push(PrevVecV[1]);
3474 InstWorklist.push(PrevVecV[0]);
3475 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3476 // We shouldn't have any null values in the previous vectors;
3477 // if so, there was a mismatch in the pattern.
3478 if (ShouldBeCallOrBinInst ||
3479 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3480 return false;
3481
3482 if (SVInst != PrevVecV[1])
3483 return false;
3484
3485 ArrayRef<int> CurMask;
3486 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
3487 m_Mask(CurMask))))
3488 return false;
3489
3490 // Subtract the parity mask when checking the condition.
3491 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
3492 if (Mask < ShuffleMaskHalf &&
3493 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
3494 return false;
3495 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
3496 return false;
3497 }
3498
3499 // Update mask values.
3500 ShuffleMaskHalf *= 2;
3501 ShuffleMaskHalf -= (ExpectedParityMask & 1);
3502 ExpectedParityMask >>= 1;
3503
3504 OrigCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
3505 SVInst->getType(), SVInst->getType(),
3506 CurMask, CostKind);
3507
3508 VisitedCnt += 1;
3509 if (!ExpectedParityMask && VisitedCnt == NumLevels)
3510 break;
3511
3512 ShouldBeCallOrBinInst ^= 1;
3513 } else {
3514 return false;
3515 }
3516 }
3517
3518 // Pattern should end with a shuffle op.
3519 if (ShouldBeCallOrBinInst)
3520 return false;
3521
3522 assert(VecSize != -1 && "Expected Match for Vector Size");
3523
3524 Value *FinalVecV = PrevVecV[0];
3525 if (!FinalVecV)
3526 return false;
3527
3528 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
3529
3530 Intrinsic::ID ReducedOp =
3531 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
3532 : getReductionForBinop(*CommonBinOp));
3533 if (!ReducedOp)
3534 return false;
3535
3536 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
3537 InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
3538
3539 if (NewCost >= OrigCost)
3540 return false;
3541
3542 auto *ReducedResult =
3543 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
3544 replaceValue(I, *ReducedResult);
3545
3546 return true;
3547}
3548
3549 /// Determine if it's more efficient to fold:
3550/// reduce(trunc(x)) -> trunc(reduce(x)).
3551/// reduce(sext(x)) -> sext(reduce(x)).
3552/// reduce(zext(x)) -> zext(reduce(x)).
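///
/// For illustration (a hypothetical example, not taken from this file):
///   %t = trunc <8 x i32> %x to <8 x i16>
///   %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %t)
/// can become, when the wide reduction plus a scalar cast is cheaper:
///   %w = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %x)
///   %r = trunc i32 %w to i16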
3553bool VectorCombine::foldCastFromReductions(Instruction &I) {
3554 auto *II = dyn_cast<IntrinsicInst>(&I);
3555 if (!II)
3556 return false;
3557
3558 bool TruncOnly = false;
3559 Intrinsic::ID IID = II->getIntrinsicID();
3560 switch (IID) {
3561 case Intrinsic::vector_reduce_add:
3562 case Intrinsic::vector_reduce_mul:
3563 TruncOnly = true;
3564 break;
3565 case Intrinsic::vector_reduce_and:
3566 case Intrinsic::vector_reduce_or:
3567 case Intrinsic::vector_reduce_xor:
3568 break;
3569 default:
3570 return false;
3571 }
3572
3573 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
3574 Value *ReductionSrc = I.getOperand(0);
3575
3576 Value *Src;
3577 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
3578 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
3579 return false;
3580
3581 auto CastOpc =
3582 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
3583
3584 auto *SrcTy = cast<VectorType>(Src->getType());
3585 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
3586 Type *ResultTy = I.getType();
3587
3588 InstructionCost OldCost = TTI.getArithmeticReductionCost(
3589 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
3590 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
3591 TTI::CastContextHint::None, CostKind,
3592 cast<CastInst>(ReductionSrc));
3593 InstructionCost NewCost =
3594 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
3595 CostKind) +
3596 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
3597 TTI::CastContextHint::None, CostKind);
3598
3599 if (OldCost <= NewCost || !NewCost.isValid())
3600 return false;
3601
3602 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
3603 II->getIntrinsicID(), {Src});
3604 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
3605 replaceValue(I, *NewCast);
3606 return true;
3607}
3608
3609/// Returns true if this ShuffleVectorInst eventually feeds into a
3610/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
3611/// chains of shuffles and binary operators (in any combination/order).
3612 /// The search is bounded by the MaxVisited limit on visited users.
3613 static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
3614 constexpr unsigned MaxVisited = 32;
3615 SmallPtrSet<Instruction *, 8> Visited;
3616 SmallVector<Instruction *, 4> WorkList;
3617 bool FoundReduction = false;
3618
3619 WorkList.push_back(SVI);
3620 while (!WorkList.empty()) {
3621 Instruction *I = WorkList.pop_back_val();
3622 for (User *U : I->users()) {
3623 auto *UI = cast<Instruction>(U);
3624 if (!UI || !Visited.insert(UI).second)
3625 continue;
3626 if (Visited.size() > MaxVisited)
3627 return false;
3628 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
3629 // More than one reduction reached
3630 if (FoundReduction)
3631 return false;
3632 switch (II->getIntrinsicID()) {
3633 case Intrinsic::vector_reduce_add:
3634 case Intrinsic::vector_reduce_mul:
3635 case Intrinsic::vector_reduce_and:
3636 case Intrinsic::vector_reduce_or:
3637 case Intrinsic::vector_reduce_xor:
3638 case Intrinsic::vector_reduce_smin:
3639 case Intrinsic::vector_reduce_smax:
3640 case Intrinsic::vector_reduce_umin:
3641 case Intrinsic::vector_reduce_umax:
3642 FoundReduction = true;
3643 continue;
3644 default:
3645 return false;
3646 }
3647 }
3648
3649 if (!isa<BinaryOperator>(UI) && !isa<ShuffleVectorInst>(UI))
3650 return false;
3651
3652 WorkList.emplace_back(UI);
3653 }
3654 }
3655 return FoundReduction;
3656}
3657
3658/// This method looks for groups of shuffles acting on binops, of the form:
3659/// %x = shuffle ...
3660/// %y = shuffle ...
3661/// %a = binop %x, %y
3662/// %b = binop %x, %y
3663/// shuffle %a, %b, selectmask
3664/// We may, especially if the shuffle is wider than legal, be able to convert
3665/// the shuffle to a form where only parts of a and b need to be computed. On
3666/// architectures with no obvious "select" shuffle, this can reduce the total
3667/// number of operations if the target reports them as cheaper.
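/// For illustration, a "selectmask" keeps every lane in place while choosing
/// it from either %a or %b, e.g. <i32 0, i32 5, i32 2, i32 7> for two
/// <4 x i32> inputs (a hypothetical example, not taken from this file); only
/// the lanes each binop actually contributes then need to be computed.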
3668bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
3669 auto *SVI = cast<ShuffleVectorInst>(&I);
3670 auto *VT = cast<FixedVectorType>(I.getType());
3671 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
3672 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
3673 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
3674 VT != Op0->getType())
3675 return false;
3676
3677 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
3678 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
3679 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
3680 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
3681 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
3682 auto checkSVNonOpUses = [&](Instruction *I) {
3683 if (!I || I->getOperand(0)->getType() != VT)
3684 return true;
3685 return any_of(I->users(), [&](User *U) {
3686 return U != Op0 && U != Op1 &&
3687 !(isa<ShuffleVectorInst>(U) &&
3688 (InputShuffles.contains(cast<Instruction>(U)) ||
3689 isInstructionTriviallyDead(cast<Instruction>(U))));
3690 });
3691 };
3692 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
3693 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
3694 return false;
3695
3696 // Collect all the uses that are shuffles that we can transform together. We
3697 // may not have a single shuffle, but a group that can all be transformed
3698 // together profitably.
3699 SmallVector<ShuffleVectorInst *> Shuffles;
3700 auto collectShuffles = [&](Instruction *I) {
3701 for (auto *U : I->users()) {
3702 auto *SV = dyn_cast<ShuffleVectorInst>(U);
3703 if (!SV || SV->getType() != VT)
3704 return false;
3705 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
3706 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
3707 return false;
3708 if (!llvm::is_contained(Shuffles, SV))
3709 Shuffles.push_back(SV);
3710 }
3711 return true;
3712 };
3713 if (!collectShuffles(Op0) || !collectShuffles(Op1))
3714 return false;
3715 // From a reduction, we need to be processing a single shuffle, otherwise the
3716 // other uses will not be lane-invariant.
3717 if (FromReduction && Shuffles.size() > 1)
3718 return false;
3719
3720 // Add any shuffle uses for the shuffles we have found, to include them in our
3721 // cost calculations.
3722 if (!FromReduction) {
3723 for (ShuffleVectorInst *SV : Shuffles) {
3724 for (auto *U : SV->users()) {
3725 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
3726 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
3727 Shuffles.push_back(SSV);
3728 }
3729 }
3730 }
3731
3732 // For each of the output shuffles, we try to sort all the first vector
3733 // elements to the beginning, followed by the second array elements at the
3734 // end. If the binops are legalized to smaller vectors, this may reduce total
3735 // number of binops. We compute the ReconstructMask mask needed to convert
3736 // back to the original lane order.
3737 SmallVector<std::pair<int, int>> V1, V2;
3738 SmallVector<SmallVector<int>> OrigReconstructMasks;
3739 int MaxV1Elt = 0, MaxV2Elt = 0;
3740 unsigned NumElts = VT->getNumElements();
3741 for (ShuffleVectorInst *SVN : Shuffles) {
3742 SmallVector<int> Mask;
3743 SVN->getShuffleMask(Mask);
3744
3745 // Check the operands are the same as the original, or reversed (in which
3746 // case we need to commute the mask).
3747 Value *SVOp0 = SVN->getOperand(0);
3748 Value *SVOp1 = SVN->getOperand(1);
3749 if (isa<UndefValue>(SVOp1)) {
3750 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
3751 SVOp0 = SSV->getOperand(0);
3752 SVOp1 = SSV->getOperand(1);
3753 for (int &Elem : Mask) {
3754 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
3755 return false;
3756 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
3757 }
3758 }
3759 if (SVOp0 == Op1 && SVOp1 == Op0) {
3760 std::swap(SVOp0, SVOp1);
3761 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
3762 }
3763 if (SVOp0 != Op0 || SVOp1 != Op1)
3764 return false;
3765
3766 // Calculate the reconstruction mask for this shuffle, as the mask needed to
3767 // take the packed values from Op0/Op1 and reconstruct the original
3768 // order.
3769 SmallVector<int> ReconstructMask;
3770 for (unsigned I = 0; I < Mask.size(); I++) {
3771 if (Mask[I] < 0) {
3772 ReconstructMask.push_back(-1);
3773 } else if (Mask[I] < static_cast<int>(NumElts)) {
3774 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
3775 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
3776 return Mask[I] == A.first;
3777 });
3778 if (It != V1.end())
3779 ReconstructMask.push_back(It - V1.begin());
3780 else {
3781 ReconstructMask.push_back(V1.size());
3782 V1.emplace_back(Mask[I], V1.size());
3783 }
3784 } else {
3785 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
3786 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
3787 return Mask[I] - static_cast<int>(NumElts) == A.first;
3788 });
3789 if (It != V2.end())
3790 ReconstructMask.push_back(NumElts + It - V2.begin());
3791 else {
3792 ReconstructMask.push_back(NumElts + V2.size());
3793 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
3794 }
3795 }
3796 }
3797
3798 // For reductions, we know that the lane ordering out doesn't alter the
3799 // result. In-order can help simplify the shuffle away.
3800 if (FromReduction)
3801 sort(ReconstructMask);
3802 OrigReconstructMasks.push_back(std::move(ReconstructMask));
3803 }
3804
3805 // If the maximum elements used from V1 and V2 are not larger than the new
3806 // vectors, the vectors are already packed and performing the optimization
3807 // again will likely not help any further. This also prevents us from getting
3808 // stuck in a cycle in case the costs do not also rule it out.
3809 if (V1.empty() || V2.empty() ||
3810 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
3811 MaxV2Elt == static_cast<int>(V2.size()) - 1))
3812 return false;
3813
3814 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
3815 // shuffle of another shuffle, or not a shuffle (that is treated like an
3816 // identity shuffle).
3817 auto GetBaseMaskValue = [&](Instruction *I, int M) {
3818 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3819 if (!SV)
3820 return M;
3821 if (isa<UndefValue>(SV->getOperand(1)))
3822 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
3823 if (InputShuffles.contains(SSV))
3824 return SSV->getMaskValue(SV->getMaskValue(M));
3825 return SV->getMaskValue(M);
3826 };
3827
3828 // Attempt to sort the inputs by ascending mask values to make simpler input
3829 // shuffles and push complex shuffles down to the uses. We sort on the first
3830 // of the two input shuffle orders, to try and get at least one input into a
3831 // nice order.
3832 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
3833 std::pair<int, int> Y) {
3834 int MXA = GetBaseMaskValue(A, X.first);
3835 int MYA = GetBaseMaskValue(A, Y.first);
3836 return MXA < MYA;
3837 };
3838 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
3839 return SortBase(SVI0A, A, B);
3840 });
3841 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
3842 return SortBase(SVI1A, A, B);
3843 });
3844 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
3845 // modified order of the input shuffles.
3846 SmallVector<SmallVector<int>> ReconstructMasks;
3847 for (const auto &Mask : OrigReconstructMasks) {
3848 SmallVector<int> ReconstructMask;
3849 for (int M : Mask) {
3850 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
3851 auto It = find_if(V, [M](auto A) { return A.second == M; });
3852 assert(It != V.end() && "Expected all entries in Mask");
3853 return std::distance(V.begin(), It);
3854 };
3855 if (M < 0)
3856 ReconstructMask.push_back(-1);
3857 else if (M < static_cast<int>(NumElts)) {
3858 ReconstructMask.push_back(FindIndex(V1, M));
3859 } else {
3860 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
3861 }
3862 }
3863 ReconstructMasks.push_back(std::move(ReconstructMask));
3864 }
3865
3866 // Calculate the masks needed for the new input shuffles, which get padded
3867 // with undef
3868 SmallVector<int> V1A, V1B, V2A, V2B;
3869 for (unsigned I = 0; I < V1.size(); I++) {
3870 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
3871 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
3872 }
3873 for (unsigned I = 0; I < V2.size(); I++) {
3874 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
3875 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
3876 }
3877 while (V1A.size() < NumElts) {
3878 V1A.push_back(PoisonMaskElem);
3879 V1B.push_back(PoisonMaskElem);
3880 }
3881 while (V2A.size() < NumElts) {
3882 V2A.push_back(PoisonMaskElem);
3883 V2B.push_back(PoisonMaskElem);
3884 }
3885
3886 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
3887 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3888 if (!SV)
3889 return C;
3890 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
3891 ? TTI::SK_PermuteSingleSrc
3892 : TTI::SK_PermuteTwoSrc,
3893 VT, VT, SV->getShuffleMask(), CostKind);
3894 };
3895 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
3896 return C +
3897 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
3898 };
3899
3900 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
3901 unsigned MaxVectorSize =
3902 TTI.getRegisterBitWidth(TTI::RGK_FixedWidthVector).getFixedValue();
3903 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
3904 // When there are multiple shufflevector operations on the same input,
3905 // especially when the vector length is larger than the register size,
3906 // identical shuffle patterns may occur across different groups of elements.
3907 // To avoid overestimating the cost by counting these repeated shuffles more
3908 // than once, we only account for unique shuffle patterns. This adjustment
3909 // prevents inflated costs in the cost model for wide vectors split into
3910 // several register-sized groups.
3911 std::set<SmallVector<int, 4>> UniqueShuffles;
3912 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
3913 // Compute the cost for performing the shuffle over the full vector.
3914 auto ShuffleCost =
3915 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
3916 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
3917 if (NumFullVectors < 2)
3918 return C + ShuffleCost;
3919 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
3920 unsigned NumUniqueGroups = 0;
3921 unsigned NumGroups = Mask.size() / MaxElementsInVector;
3922 // For each group of MaxElementsInVector contiguous elements,
3923 // collect their shuffle pattern and insert into the set of unique patterns.
3924 for (unsigned I = 0; I < NumFullVectors; ++I) {
3925 for (unsigned J = 0; J < MaxElementsInVector; ++J)
3926 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
3927 if (UniqueShuffles.insert(SubShuffle).second)
3928 NumUniqueGroups += 1;
3929 }
3930 return C + ShuffleCost * NumUniqueGroups / NumGroups;
3931 };
3932 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
3933 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3934 if (!SV)
3935 return C;
3936 SmallVector<int> Mask;
3937 SV->getShuffleMask(Mask);
3938 return AddShuffleMaskAdjustedCost(C, Mask);
3939 };
3940 // Check that input consists of ShuffleVectors applied to the same input
3941 auto AllShufflesHaveSameOperands =
3942 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
3943 if (InputShuffles.size() < 2)
3944 return false;
3945 ShuffleVectorInst *FirstSV =
3946 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
3947 if (!FirstSV)
3948 return false;
3949
3950 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
3951 return std::all_of(
3952 std::next(InputShuffles.begin()), InputShuffles.end(),
3953 [&](Instruction *I) {
3954 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
3955 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
3956 });
3957 };
3958
3959 // Get the costs of the shuffles + binops before and after with the new
3960 // shuffle masks.
3961 InstructionCost CostBefore =
3962 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
3963 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
3964 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
3965 InstructionCost(0), AddShuffleCost);
3966 if (AllShufflesHaveSameOperands(InputShuffles)) {
3967 UniqueShuffles.clear();
3968 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
3969 InstructionCost(0), AddShuffleAdjustedCost);
3970 } else {
3971 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
3972 InstructionCost(0), AddShuffleCost);
3973 }
3974
3975 // The new binops will be unused for lanes past the used shuffle lengths.
3976 // These types attempt to get the correct cost for that from the target.
3977 FixedVectorType *Op0SmallVT =
3978 FixedVectorType::get(VT->getScalarType(), V1.size());
3979 FixedVectorType *Op1SmallVT =
3980 FixedVectorType::get(VT->getScalarType(), V2.size());
3981 InstructionCost CostAfter =
3982 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
3983 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
3984 UniqueShuffles.clear();
3985 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
3986 InstructionCost(0), AddShuffleMaskAdjustedCost);
3987 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
3988 CostAfter +=
3989 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
3990 InstructionCost(0), AddShuffleMaskCost);
3991
3992 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
3993 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
3994 << " vs CostAfter: " << CostAfter << "\n");
3995 if (CostBefore < CostAfter ||
3996 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
3997 return false;
3998
3999 // The cost model has passed, create the new instructions.
4000 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
4001 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4002 if (!SV)
4003 return I;
4004 if (isa<UndefValue>(SV->getOperand(1)))
4005 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4006 if (InputShuffles.contains(SSV))
4007 return SSV->getOperand(Op);
4008 return SV->getOperand(Op);
4009 };
4010 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
4011 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
4012 GetShuffleOperand(SVI0A, 1), V1A);
4013 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
4014 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
4015 GetShuffleOperand(SVI0B, 1), V1B);
4016 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
4017 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
4018 GetShuffleOperand(SVI1A, 1), V2A);
4019 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
4020 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
4021 GetShuffleOperand(SVI1B, 1), V2B);
4022 Builder.SetInsertPoint(Op0);
4023 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
4024 NSV0A, NSV0B);
4025 if (auto *I = dyn_cast<Instruction>(NOp0))
4026 I->copyIRFlags(Op0, true);
4027 Builder.SetInsertPoint(Op1);
4028 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
4029 NSV1A, NSV1B);
4030 if (auto *I = dyn_cast<Instruction>(NOp1))
4031 I->copyIRFlags(Op1, true);
4032
4033 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
4034 Builder.SetInsertPoint(Shuffles[S]);
4035 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
4036 replaceValue(*Shuffles[S], *NSV, false);
4037 }
4038
4039 Worklist.pushValue(NSV0A);
4040 Worklist.pushValue(NSV0B);
4041 Worklist.pushValue(NSV1A);
4042 Worklist.pushValue(NSV1B);
4043 return true;
4044}
4045
4046/// Check if instruction depends on ZExt and this ZExt can be moved after the
4047/// instruction. Move ZExt if it is profitable. For example:
4048/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
4049 /// lshr(zext(x),y) -> zext(lshr(x,trunc(y)))
4050 /// Cost model calculations take into account if zext(x) has other users and
4051 /// whether it can be propagated through them too.
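///
/// For illustration (a hypothetical example, not taken from this file):
///   %zx = zext <4 x i8> %x to <4 x i32>
///   %r  = and <4 x i32> %zx, <i32 15, i32 15, i32 15, i32 15>
/// can be rewritten, when the narrow operation is cheaper, as:
///   %n  = and <4 x i8> %x, <i8 15, i8 15, i8 15, i8 15>
///   %r  = zext <4 x i8> %n to <4 x i32>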
4052bool VectorCombine::shrinkType(Instruction &I) {
4053 Value *ZExted, *OtherOperand;
4054 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
4055 m_Value(OtherOperand))) &&
4056 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
4057 return false;
4058
4059 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
4060
4061 auto *BigTy = cast<FixedVectorType>(I.getType());
4062 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
4063 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
4064
4065 if (I.getOpcode() == Instruction::LShr) {
4066 // Check that the shift amount is less than the number of bits in the
4067 // smaller type. Otherwise, the smaller lshr will return a poison value.
4068 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
4069 if (ShAmtKB.getMaxValue().uge(BW))
4070 return false;
4071 } else {
4072 // Check that the expression overall uses at most the same number of bits as
4073 // ZExted
4074 KnownBits KB = computeKnownBits(&I, *DL);
4075 if (KB.countMaxActiveBits() > BW)
4076 return false;
4077 }
4078
4079 // Calculate costs of leaving current IR as it is and moving ZExt operation
4080 // later, along with adding truncates if needed
4081 InstructionCost ZExtCost = TTI.getCastInstrCost(
4082 Instruction::ZExt, BigTy, SmallTy,
4083 TargetTransformInfo::CastContextHint::None, CostKind);
4084 InstructionCost CurrentCost = ZExtCost;
4085 InstructionCost ShrinkCost = 0;
4086
4087 // Calculate total cost and check that we can propagate through all ZExt users
4088 for (User *U : ZExtOperand->users()) {
4089 auto *UI = cast<Instruction>(U);
4090 if (UI == &I) {
4091 CurrentCost +=
4092 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4093 ShrinkCost +=
4094 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4095 ShrinkCost += ZExtCost;
4096 continue;
4097 }
4098
4099 if (!Instruction::isBinaryOp(UI->getOpcode()))
4100 return false;
4101
4102 // Check if we can propagate ZExt through its other users
4103 KnownBits KB = computeKnownBits(UI, *DL);
4104 if (KB.countMaxActiveBits() > BW)
4105 return false;
4106
4107 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4108 ShrinkCost +=
4109 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4110 ShrinkCost += ZExtCost;
4111 }
4112
4113 // If the other instruction operand is not a constant, we'll need to
4114 // generate a truncate instruction, so we have to adjust the cost.
4115 if (!isa<Constant>(OtherOperand))
4116 ShrinkCost += TTI.getCastInstrCost(
4117 Instruction::Trunc, SmallTy, BigTy,
4118 TargetTransformInfo::CastContextHint::None, CostKind);
4119
4120 // If the cost of shrinking types and leaving the IR is the same, we'll lean
4121 // towards modifying the IR because shrinking opens opportunities for other
4122 // shrinking optimisations.
4123 if (ShrinkCost > CurrentCost)
4124 return false;
4125
4126 Builder.SetInsertPoint(&I);
4127 Value *Op0 = ZExted;
4128 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
4129 // Keep the order of operands the same
4130 if (I.getOperand(0) == OtherOperand)
4131 std::swap(Op0, Op1);
4132 Value *NewBinOp =
4133 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
4134 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
4135 cast<Instruction>(NewBinOp)->copyMetadata(I);
4136 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
4137 replaceValue(I, *NewZExtr);
4138 return true;
4139}
4140
4141/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
4142/// shuffle (DstVec, SrcVec, Mask)
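///
/// For illustration (a hypothetical example with equal vector lengths, not
/// taken from this file):
///   %e = extractelement <4 x i32> %src, i64 2
///   %r = insertelement <4 x i32> %dst, i32 %e, i64 0
/// becomes a two-source shuffle in which index 6 selects element 2 of %src:
///   %r = shufflevector <4 x i32> %dst, <4 x i32> %src, <4 x i32> <i32 6, i32 1, i32 2, i32 3>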
4143bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
4144 Value *DstVec, *SrcVec;
4145 uint64_t ExtIdx, InsIdx;
4146 if (!match(&I,
4147 m_InsertElt(m_Value(DstVec),
4148 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
4149 m_ConstantInt(InsIdx))))
4150 return false;
4151
4152 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
4153 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
4154 // We can try combining vectors with different lengths, but their element
4155 // types must match.
4155 if (!DstVecTy || !SrcVecTy ||
4156 SrcVecTy->getElementType() != DstVecTy->getElementType())
4157 return false;
4158
4159 unsigned NumDstElts = DstVecTy->getNumElements();
4160 unsigned NumSrcElts = SrcVecTy->getNumElements();
4161 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
4162 return false;
4163
4164 // Insertion into poison is a cheaper single operand shuffle.
4165 TargetTransformInfo::ShuffleKind SK = TargetTransformInfo::SK_PermuteTwoSrc;
4166 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
4167
4168 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4169 bool IsExtIdxInBounds = ExtIdx < NumDstElts;
4170 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
4171 if (NeedDstSrcSwap) {
4172 SK = TargetTransformInfo::SK_PermuteSingleSrc;
4173 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4174 Mask[InsIdx] = 0;
4175 else
4176 Mask[InsIdx] = ExtIdx;
4177 std::swap(DstVec, SrcVec);
4178 } else {
4180 std::iota(Mask.begin(), Mask.end(), 0);
4181 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4182 Mask[InsIdx] = NumDstElts;
4183 else
4184 Mask[InsIdx] = ExtIdx + NumDstElts;
4185 }
4186
4187 // Cost
4188 auto *Ins = cast<InsertElementInst>(&I);
4189 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
4190 InstructionCost InsCost =
4191 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
4192 InstructionCost ExtCost =
4193 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
4194 InstructionCost OldCost = ExtCost + InsCost;
4195
4196 InstructionCost NewCost = 0;
4197 SmallVector<int> ExtToVecMask;
4198 if (!NeedExpOrNarrow) {
4199 // Ignore 'free' identity insertion shuffle.
4200 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
4201 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
4202 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
4203 nullptr, {DstVec, SrcVec});
4204 } else {
4205 // When creating a length-changing vector, always create it with a mask whose
4206 // first element is ExtIdx, so that the first element of the vector being
4207 // created is always the element to be extracted.
4208 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4209 if (IsExtIdxInBounds)
4210 ExtToVecMask[ExtIdx] = ExtIdx;
4211 else
4212 ExtToVecMask[0] = ExtIdx;
4213 // Add cost for expanding or narrowing
4214 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
4215 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
4216 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
4217 }
4218
4219 if (!Ext->hasOneUse())
4220 NewCost += ExtCost;
4221
4222 LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair: " << I
4223 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4224 << "\n");
4225
4226 if (OldCost < NewCost)
4227 return false;
4228
4229 if (NeedExpOrNarrow) {
4230 if (!NeedDstSrcSwap)
4231 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
4232 else
4233 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
4234 }
4235
4236 // Canonicalize undef param to RHS to help further folds.
4237 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
4238 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
4239 std::swap(DstVec, SrcVec);
4240 }
4241
4242 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
4243 replaceValue(I, *Shuf);
4244
4245 return true;
4246}
4247
4248/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
4249/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
4250/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
4251/// before casting it back into `<vscale x 16 x i32>`.
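///
/// For illustration (a hypothetical fixed-width variant, not taken from this
/// file):
///   %r = call <8 x i32> @llvm.vector.interleave2.v8i32(
///            <4 x i32> splat (i32 666), <4 x i32> splat (i32 777))
/// produces the repeating pair 666, 777, which the transform instead
/// materializes as a <4 x i64> splat of ((777 << 32) | 666) followed by a
/// single bitcast back to <8 x i32>.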
4252bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
4253 const APInt *SplatVal0, *SplatVal1;
4254 if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
4255 m_APInt(SplatVal0), m_APInt(SplatVal1))))
4256 return false;
4257
4258 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
4259 << "\n");
4260
4261 auto *VTy =
4262 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
4263 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
4264 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
4265
4266 // Use <= rather than < so that we bail out when the costs of the
4267 // interleave2 intrinsic and the bitcast are both invalid. Even if they are
4268 // both valid and equal, it's probably not a good idea to emit a high-cost
4269 // constant splat.
4270 if (TTI.getInstructionCost(&I, CostKind) <=
4271 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
4272 TTI::CastContextHint::None, CostKind)) {
4273 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
4274 << *I.getType() << " is too high.\n");
4275 return false;
4276 }
4277
4278 APInt NewSplatVal = SplatVal1->zext(Width * 2);
4279 NewSplatVal <<= Width;
4280 NewSplatVal |= SplatVal0->zext(Width * 2);
4281 auto *NewSplat = ConstantVector::getSplat(
4282 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
4283
4284 IRBuilder<> Builder(&I);
4285 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
4286 return true;
4287}
4288
4289// Attempt to shrink loads that are only used by shufflevector instructions.
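// For illustration (a hypothetical example, not taken from this file):
//   %v = load <8 x i32>, ptr %p
//   %s = shufflevector <8 x i32> %v, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
// only the first two lanes of the load are ever referenced, so when the cost
// model agrees the load can be narrowed:
//   %v2 = load <2 x i32>, ptr %p
//   %s  = shufflevector <2 x i32> %v2, <2 x i32> poison, <2 x i32> <i32 0, i32 1>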
4290bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
4291 auto *OldLoad = dyn_cast<LoadInst>(&I);
4292 if (!OldLoad || !OldLoad->isSimple())
4293 return false;
4294
4295 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
4296 if (!OldLoadTy)
4297 return false;
4298
4299 unsigned const OldNumElements = OldLoadTy->getNumElements();
4300
4301 // Search all uses of load. If all uses are shufflevector instructions, and
4302 // the second operands are all poison values, find the minimum and maximum
4303 // indices of the vector elements referenced by all shuffle masks.
4304 // Otherwise return `std::nullopt`.
4305 using IndexRange = std::pair<int, int>;
4306 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
4307 IndexRange OutputRange = IndexRange(OldNumElements, -1);
4308 for (llvm::Use &Use : I.uses()) {
4309 // Ensure all uses match the required pattern.
4310 User *Shuffle = Use.getUser();
4311 ArrayRef<int> Mask;
4312
4313 if (!match(Shuffle,
4314 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
4315 return std::nullopt;
4316
4317 // Ignore shufflevector instructions that have no uses.
4318 if (Shuffle->use_empty())
4319 continue;
4320
4321 // Find the min and max indices used by the shufflevector instruction.
4322 for (int Index : Mask) {
4323 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
4324 OutputRange.first = std::min(Index, OutputRange.first);
4325 OutputRange.second = std::max(Index, OutputRange.second);
4326 }
4327 }
4328 }
4329
4330 if (OutputRange.second < OutputRange.first)
4331 return std::nullopt;
4332
4333 return OutputRange;
4334 };
4335
4336 // Get the range of vector elements used by shufflevector instructions.
4337 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
4338 unsigned const NewNumElements = Indices->second + 1u;
4339
4340 // If the range of vector elements is smaller than the full load, attempt
4341 // to create a smaller load.
4342 if (NewNumElements < OldNumElements) {
4343 IRBuilder Builder(&I);
4344 Builder.SetCurrentDebugLocation(I.getDebugLoc());
4345
4346 // Calculate costs of old and new ops.
4347 Type *ElemTy = OldLoadTy->getElementType();
4348 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
4349 Value *PtrOp = OldLoad->getPointerOperand();
4350
4351 InstructionCost OldCost = TTI.getMemoryOpCost(
4352 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
4353 OldLoad->getPointerAddressSpace(), CostKind);
4354 InstructionCost NewCost =
4355 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
4356 OldLoad->getPointerAddressSpace(), CostKind);
4357
4358 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
4359 SmallVector<UseEntry, 4u> NewUses;
4360 unsigned const MaxIndex = NewNumElements * 2u;
4361
4362 for (llvm::Use &Use : I.uses()) {
4363 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
4364 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
4365
4366 // Create entry for new use.
4367 NewUses.push_back({Shuffle, OldMask});
4368
4369 // Validate mask indices.
4370 for (int Index : OldMask) {
4371 if (Index >= static_cast<int>(MaxIndex))
4372 return false;
4373 }
4374
4375 // Update costs.
4376 OldCost +=
4378 OldLoadTy, OldMask, CostKind);
4379 NewCost +=
4381 NewLoadTy, OldMask, CostKind);
4382 }
4383
4384 LLVM_DEBUG(
4385 dbgs() << "Found a load used only by shufflevector instructions: "
4386 << I << "\n OldCost: " << OldCost
4387 << " vs NewCost: " << NewCost << "\n");
4388
4389 if (OldCost < NewCost || !NewCost.isValid())
4390 return false;
4391
4392 // Create new load of smaller vector.
4393 auto *NewLoad = cast<LoadInst>(
4394 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
4395 NewLoad->copyMetadata(I);
4396
4397 // Replace all uses.
4398 for (UseEntry &Use : NewUses) {
4399 ShuffleVectorInst *Shuffle = Use.first;
4400 std::vector<int> &NewMask = Use.second;
4401
4402 Builder.SetInsertPoint(Shuffle);
4403 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
4404 Value *NewShuffle = Builder.CreateShuffleVector(
4405 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
4406
4407 replaceValue(*Shuffle, *NewShuffle, false);
4408 }
4409
4410 return true;
4411 }
4412 }
4413 return false;
4414}
4415
4416// Attempt to narrow a phi of shufflevector instructions where the two incoming
4417// values have the same operands but different masks. If the two shuffle masks
4418// are offsets of one another we can use one branch to rotate the incoming
4419// vector and perform one larger shuffle after the phi.
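// For illustration (hypothetical masks, not taken from this file): with a
// <4 x i32> input widened to <8 x i32>, incoming masks <2, 3, poison, ...>
// and <0, 1, poison, ...> differ by a constant 2, so one incoming edge gets a
// <4 x i32> rotation <2, 3, 0, 1>, the phi is narrowed to <4 x i32>, and a
// single widening shuffle using the second mask is emitted after the phi.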
4420bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
4421 auto *Phi = dyn_cast<PHINode>(&I);
4422 if (!Phi || Phi->getNumIncomingValues() != 2u)
4423 return false;
4424
4425 Value *Op = nullptr;
4426 ArrayRef<int> Mask0;
4427 ArrayRef<int> Mask1;
4428
4429 if (!match(Phi->getOperand(0u),
4430 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
4431 !match(Phi->getOperand(1u),
4432 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
4433 return false;
4434
4435 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
4436
4437 // Ensure result vectors are wider than the argument vector.
4438 auto *InputVT = cast<FixedVectorType>(Op->getType());
4439 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
4440 auto const InputNumElements = InputVT->getNumElements();
4441
4442 if (InputNumElements >= ResultVT->getNumElements())
4443 return false;
4444
4445 // Take the difference of the two shuffle masks at each index. Ignore poison
4446 // values at the same index in both masks.
4447 SmallVector<int, 16> NewMask;
4448 NewMask.reserve(Mask0.size());
4449
4450 for (auto [M0, M1] : zip(Mask0, Mask1)) {
4451 if (M0 >= 0 && M1 >= 0)
4452 NewMask.push_back(M0 - M1);
4453 else if (M0 == -1 && M1 == -1)
4454 continue;
4455 else
4456 return false;
4457 }
4458
4459 // Ensure all elements of the new mask are equal. If the difference between
4460 // the incoming mask elements is the same, the two must be constant offsets
4461 // of one another.
4462 if (NewMask.empty() || !all_equal(NewMask))
4463 return false;
4464
4465 // Create new mask using difference of the two incoming masks.
4466 int MaskOffset = NewMask[0u];
4467 unsigned Index = (InputNumElements - MaskOffset) % InputNumElements;
4468 NewMask.clear();
4469
4470 for (unsigned I = 0u; I < InputNumElements; ++I) {
4471 NewMask.push_back(Index);
4472 Index = (Index + 1u) % InputNumElements;
4473 }
4474
4475 // Calculate costs for worst cases and compare.
4476 auto const Kind = TTI::SK_PermuteSingleSrc;
4477 auto OldCost =
4478 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
4479 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
4480 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
4481 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
4482
4483 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
4484 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4485 << "\n");
4486
4487 if (NewCost > OldCost)
4488 return false;
4489
4490 // Create new shuffles and narrowed phi.
4491 auto Builder = IRBuilder(Shuf);
4492 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
4493 auto *PoisonVal = PoisonValue::get(InputVT);
4494 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
4495 Worklist.push(cast<Instruction>(NewShuf0));
4496
4497 Builder.SetInsertPoint(Phi);
4498 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
4499 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
4500 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
4501 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
4502
4503 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
4504 PoisonVal = PoisonValue::get(NewPhi->getType());
4505 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
4506
4507 replaceValue(*Phi, *NewShuf1);
4508 return true;
4509}
4510
4511/// This is the entry point for all transforms. Pass manager differences are
4512/// handled in the callers of this function.
4513bool VectorCombine::run() {
4514 if (DisableVectorCombine)
4515 return false;
4516
4517 // Don't attempt vectorization if the target does not support vectors.
4518 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
4519 return false;
4520
4521 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
4522
4523 auto FoldInst = [this](Instruction &I) {
4524 Builder.SetInsertPoint(&I);
4525 bool IsVectorType = isa<VectorType>(I.getType());
4526 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
4527 auto Opcode = I.getOpcode();
4528
4529 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
4530
4531 // These folds should be beneficial regardless of when this pass is run
4532 // in the optimization pipeline.
4533 // The type checking is for run-time efficiency. We can avoid wasting time
4534 // dispatching to folding functions if there's no chance of matching.
4535 if (IsFixedVectorType) {
4536 switch (Opcode) {
4537 case Instruction::InsertElement:
4538 if (vectorizeLoadInsert(I))
4539 return true;
4540 break;
4541 case Instruction::ShuffleVector:
4542 if (widenSubvectorLoad(I))
4543 return true;
4544 break;
4545 default:
4546 break;
4547 }
4548 }
4549
4550 // This transform works with scalable and fixed vectors
4551 // TODO: Identify and allow other scalable transforms
4552 if (IsVectorType) {
4553 if (scalarizeOpOrCmp(I))
4554 return true;
4555 if (scalarizeLoadExtract(I))
4556 return true;
4557 if (scalarizeExtExtract(I))
4558 return true;
4559 if (scalarizeVPIntrinsic(I))
4560 return true;
4561 if (foldInterleaveIntrinsics(I))
4562 return true;
4563 }
4564
4565 if (Opcode == Instruction::Store)
4566 if (foldSingleElementStore(I))
4567 return true;
4568
4569 // If this is an early pipeline invocation of this pass, we are done.
4570 if (TryEarlyFoldsOnly)
4571 return false;
4572
4573 // Otherwise, try folds that improve codegen but may interfere with
4574 // early IR canonicalizations.
4575 // The type checking is for run-time efficiency. We can avoid wasting time
4576 // dispatching to folding functions if there's no chance of matching.
4577 if (IsFixedVectorType) {
4578 switch (Opcode) {
4579 case Instruction::InsertElement:
4580 if (foldInsExtFNeg(I))
4581 return true;
4582 if (foldInsExtBinop(I))
4583 return true;
4584 if (foldInsExtVectorToShuffle(I))
4585 return true;
4586 break;
4587 case Instruction::ShuffleVector:
4588 if (foldPermuteOfBinops(I))
4589 return true;
4590 if (foldShuffleOfBinops(I))
4591 return true;
4592 if (foldShuffleOfSelects(I))
4593 return true;
4594 if (foldShuffleOfCastops(I))
4595 return true;
4596 if (foldShuffleOfShuffles(I))
4597 return true;
4598 if (foldShuffleOfIntrinsics(I))
4599 return true;
4600 if (foldSelectShuffle(I))
4601 return true;
4602 if (foldShuffleToIdentity(I))
4603 return true;
4604 break;
4605 case Instruction::Load:
4606 if (shrinkLoadForShuffles(I))
4607 return true;
4608 break;
4609 case Instruction::BitCast:
4610 if (foldBitcastShuffle(I))
4611 return true;
4612 break;
4613 case Instruction::And:
4614 case Instruction::Or:
4615 case Instruction::Xor:
4616 if (foldBitOpOfCastops(I))
4617 return true;
4618 if (foldBitOpOfCastConstant(I))
4619 return true;
4620 break;
4621 case Instruction::PHI:
4622 if (shrinkPhiOfShuffles(I))
4623 return true;
4624 break;
4625 default:
4626 if (shrinkType(I))
4627 return true;
4628 break;
4629 }
4630 } else {
4631 switch (Opcode) {
4632 case Instruction::Call:
4633 if (foldShuffleFromReductions(I))
4634 return true;
4635 if (foldCastFromReductions(I))
4636 return true;
4637 break;
4638 case Instruction::ExtractElement:
4639 if (foldShuffleChainsToReduce(I))
4640 return true;
4641 break;
4642 case Instruction::ICmp:
4643 case Instruction::FCmp:
4644 if (foldExtractExtract(I))
4645 return true;
4646 break;
4647 case Instruction::Or:
4648 if (foldConcatOfBoolMasks(I))
4649 return true;
4650 [[fallthrough]];
4651 default:
4652 if (Instruction::isBinaryOp(Opcode)) {
4653 if (foldExtractExtract(I))
4654 return true;
4655 if (foldExtractedCmps(I))
4656 return true;
4657 if (foldBinopOfReductions(I))
4658 return true;
4659 }
4660 break;
4661 }
4662 }
4663 return false;
4664 };
4665
4666 bool MadeChange = false;
4667 for (BasicBlock &BB : F) {
4668 // Ignore unreachable basic blocks.
4669 if (!DT.isReachableFromEntry(&BB))
4670 continue;
4671 // We want to erase instructions while iterating over the block, but
4672 // make_early_inc_range is not applicable here, as the next iterator may
4673 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
4674 // We manually maintain the next instruction and update it when it is about
4675 // to be deleted.
4676 Instruction *I = &BB.front();
4677 while (I) {
4678 NextInst = I->getNextNode();
4679 if (!I->isDebugOrPseudoInst())
4680 MadeChange |= FoldInst(*I);
4681 I = NextInst;
4682 }
4683 }
4684
4685 NextInst = nullptr;
4686
4687 while (!Worklist.isEmpty()) {
4688 Instruction *I = Worklist.removeOne();
4689 if (!I)
4690 continue;
4691
4692 if (isInstructionTriviallyDead(I)) {
4693 eraseInstruction(*I);
4694 continue;
4695 }
4696
4697 MadeChange |= FoldInst(*I);
4698 }
4699
4700 return MadeChange;
4701}
4702
4703 PreservedAnalyses VectorCombinePass::run(Function &F,
4704 FunctionAnalysisManager &FAM) {
4705 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
4706 TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
4707 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
4708 AAResults &AA = FAM.getResult<AAManager>(F);
4709 const DataLayout *DL = &F.getDataLayout();
4710 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
4711 TryEarlyFoldsOnly);
4712 if (!Combiner.run())
4713 return PreservedAnalyses::all();
4714 PreservedAnalyses PA;
4715 PA.preserveSet<CFGAnalyses>();
4716 return PA;
4717}
The memory operation on a vector of ScalarType had alignment of VectorAlignment.
static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI)
Returns true if this ShuffleVectorInst eventually feeds into a vector reduction intrinsic (e....
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, Instruction *CtxI, AssumptionCache &AC, const DominatorTree &DT)
Check if it is legal to scalarize a memory access to VecTy at index Idx.
static cl::opt< bool > DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden, cl::desc("Disable all vector combine transforms"))
static InstLane lookThroughShuffles(Use *U, int Lane)
static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI)
static const unsigned InvalidIndex
std::pair< Use *, int > InstLane
static Constant * getLosslessInvCast(Constant *C, Type *InvCastTo, Instruction::CastOps CastOp, const DataLayout &DL, PreservedCastFlags &Flags)
static Value * translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, IRBuilderBase &Builder)
Given an extract element instruction with constant index operand, shuffle the source vector (shift th...
static cl::opt< unsigned > MaxInstrsToScan("vector-combine-max-scan-instrs", cl::init(30), cl::Hidden, cl::desc("Max number of instructions to scan for vector combining."))
static cl::opt< bool > DisableBinopExtractShuffle("disable-binop-extract-shuffle", cl::init(false), cl::Hidden, cl::desc("Disable binop extract to shuffle transforms"))
static bool isMemModifiedBetween(BasicBlock::iterator Begin, BasicBlock::iterator End, const MemoryLocation &Loc, AAResults &AA)
static constexpr int Concat[]
Value * RHS
Value * LHS
A manager for alias analyses.
A private abstract base class describing the concept of an individual alias analysis implementation.
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Check whether or not an instruction may read or write the optionally specified memory location.
Class for arbitrary precision integers.
Definition: APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:1012
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1221
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:255
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:412
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:150
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:147
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Return true if the attribute exists in this set.
LLVM Basic Block Representation.
Definition: BasicBlock.h:62
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:170
BinaryOps getOpcode() const
Definition: InstrTypes.h:374
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:73
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1292
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1283
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:984
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:678
bool isFPPredicate() const
Definition: InstrTypes.h:784
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:23
static LLVM_ABI std::optional< CmpPredicate > getMatching(CmpPredicate A, CmpPredicate B)
Compares two CmpPredicates taking samesign into account and returns the canonicalized CmpPredicate if...
Combiner implementation.
Definition: Combiner.h:34
static LLVM_ABI Constant * getExtractElement(Constant *Vec, Constant *Idx, Type *OnlyIfReducedTy=nullptr)
Definition: Constants.cpp:2564
static LLVM_ABI Constant * getTrunc(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2272
This is the shared class of boolean and integer constants.
Definition: Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:154
This class represents a range of values.
Definition: ConstantRange.h:47
LLVM_ABI ConstantRange urem(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an unsigned remainder operation of...
LLVM_ABI ConstantRange binaryAnd(const ConstantRange &Other) const
Return a new range representing the possible values resulting from a binary-and of a value in this ra...
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1474
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1423
This is an important base class in LLVM.
Definition: Constant.h:43
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:165
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:229
iterator end()
Definition: DenseMap.h:81
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:284
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:165
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:334
This instruction extracts a single (scalar) element from a VectorType value.
This class represents a cast from floating point to signed integer.
This class represents a cast from floating point to unsigned integer.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:22
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:592
unsigned getNumElements() const
Definition: DerivedTypes.h:635
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
Definition: DerivedTypes.h:627
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:803
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2571
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2559
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1864
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1010
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Definition: IRBuilder.cpp:1115
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1005
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2637
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1513
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition: IRBuilder.h:2238
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:247
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1931
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2263
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:527
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1809
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:834
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2463
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2494
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:172
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2204
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1847
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1492
LLVM_ABI Value * CreateNAryOp(unsigned Opc, ArrayRef< Value * > Ops, const Twine &Name="", MDNode *FPMathTag=nullptr)
Create either a UnaryOperator or BinaryOperator depending on Opc.
Definition: IRBuilder.cpp:922
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2082
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2593
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1551
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1860
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2068
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:605
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1708
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:207
Value * CreateFNegFMF(Value *V, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1795
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition: IRBuilder.h:1573
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2780
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
InstructionWorklist - This is the worklist management logic for InstCombine and other simplification ...
void pushUsersToWorkList(Instruction &I)
When an instruction is simplified, add all users of the instruction to the work lists because they mi...
void push(Instruction *I)
Push the instruction onto the worklist stack.
void remove(Instruction *I)
Remove I from the worklist if it exists.
LLVM_ABI void copyIRFlags(const Value *V, bool IncludeWrapFlags=true)
Convenience method to copy supported exact, fast-math, and (optionally) wrapping flags from V to this...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:513
LLVM_ABI void andIRFlags(const Value *V)
Logical 'and' of any supported wrapping, exact, and fast-math flags of V and this instruction.
bool isBinaryOp() const
Definition: Instruction.h:317
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:312
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
bool isIntDivRem() const
Definition: Instruction.h:318
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:319
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:49
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:56
An instruction for reading from memory.
Definition: Instructions.h:180
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:303
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1885
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:151
This class represents a sign extension of integer types.
This class represents a cast from signed integer to floating point.
This instruction constructs a fixed permutation of two input vectors.
int getMaskValue(unsigned Elt) const
Return the shuffle mask value of this instruction for the given element index.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static void commuteShuffleMask(MutableArrayRef< int > Mask, unsigned InVecNumElts)
Change values in a shuffle permute mask assuming the two vector operands of length InVecNumElts have ...
size_type size() const
Definition: SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:380
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:401
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:476
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:541
bool empty() const
Definition: SmallVector.h:82
size_t size() const
Definition: SmallVector.h:79
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:705
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:938
void reserve(size_type N)
Definition: SmallVector.h:664
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:684
void push_back(const T &Elt)
Definition: SmallVector.h:414
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1197
An instruction for storing to memory.
Definition: Instructions.h:296
void setAlignment(Align Align)
Definition: Instructions.h:342
Analysis pass providing the TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, const Value *Op0=nullptr, const Value *Op1=nullptr) const
LLVM_ABI InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add/...
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const
Returns true if GEP should not be used to index into vectors for this target.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
LLVM_ABI unsigned getMinVectorRegisterBitWidth() const
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ None
The cast is not used with a load/store of any kind.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:267
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:240
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:352
This class represents a cast unsigned integer to floating point.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition: Use.h:61
op_range operands()
Definition: User.h:292
Value * getOperand(unsigned i) const
Definition: User.h:232
static LLVM_ABI bool isVPBinOp(Intrinsic::ID ID)
This is the common base class for vector predication intrinsics.
std::optional< unsigned > getFunctionalIntrinsicID() const
std::optional< unsigned > getFunctionalOpcode() const
LLVM Value Representation.
Definition: Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition: Value.h:759
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:546
iterator_range< user_iterator > users()
Definition: Value.h:426
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:956
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:543
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:150
bool use_empty() const
Definition: Value.h:346
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:322
PreservedAnalyses run(Function &F, FunctionAnalysisManager &)
This class represents zero extension of integer types.
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:359
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:126
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
Definition: Intrinsics.cpp:743
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
class_match< PoisonValue > m_Poison()
Match an arbitrary poison constant.
Definition: PatternMatch.h:160
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::URem > m_URem(const LHS &L, const RHS &R)
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
Definition: PatternMatch.h:165
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:862
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:962
DisjointOr_match< LHS, RHS > m_DisjointOr(const LHS &L, const RHS &R)
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
Definition: PatternMatch.h:245
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
BinOpPred_match< LHS, RHS, is_bitwiselogic_op, true > m_c_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations in either order.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
match_combine_or< CastInst_match< OpTy, SExtInst >, NNegZExt_match< OpTy > > m_SExtLike(const OpTy &Op)
Match either "sext" or "zext nneg".
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:444
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:338
@ Offset
Definition: DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:860
void stable_sort(R &&Range)
Definition: STLExtras.h:2077
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1737
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1744
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:533
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2491
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:355
LLVM_ABI Value * simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q)
Given operand for a UnaryOperator, fold the result or return null.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
Definition: LoopUtils.cpp:975
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:252
LLVM_ABI Value * simplifyCall(CallBase *Call, Value *Callee, ArrayRef< Value * > Args, const SimplifyQuery &Q)
Given a callsite, callee, and arguments, fold the result or return null.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:663
LLVM_ABI bool mustSuppressSpeculation(const LoadInst &LI)
Return true if speculation of the given load must be suppressed to avoid ordering or interfering with...
Definition: Loads.cpp:423
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, bool UseInstrInfo=true, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
unsigned M1(unsigned Val)
Definition: VE.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1751
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:402
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:288
bool isModSet(const ModRefInfo MRI)
Definition: ModRef.h:49
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1669
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition: Loads.cpp:438
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:207
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1393
LLVM_ABI bool isKnownNonZero(const Value *V, const SimplifyQuery &Q, unsigned Depth=0)
Return true if the given value is known to be non-zero when defined.
constexpr int PoisonMaskElem
LLVM_ABI bool isSafeToSpeculativelyExecuteWithOpcode(unsigned Opcode, const Instruction *Inst, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
This returns the same result as isSafeToSpeculativelyExecute if Opcode is the actual opcode of Inst.
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
LLVM_ABI Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc)
Returns the reduction intrinsic id corresponding to the binary operation.
Definition: LoopUtils.cpp:1005
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:376
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:223
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1777
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1916
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition: STLExtras.h:2127
LLVM_ABI Value * simplifyCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a CmpInst, fold the result or return null.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicID(Intrinsic::ID IID)
Returns the llvm.vector.reduce min/max intrinsic that corresponds to the intrinsic op.
Definition: LoopUtils.cpp:959
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:853
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition: Metadata.h:760
LLVM_ABI AAMDNodes adjustForAccess(unsigned AccessSize)
Create a new AAMDNode for accessing AccessSize bytes of this AAMDNode.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:289
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:138
Matching combinators.