1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
38#include <numeric>
39#include <optional>
40#include <queue>
41#include <set>
42
43#define DEBUG_TYPE "vector-combine"
45
46using namespace llvm;
47using namespace llvm::PatternMatch;
48
49STATISTIC(NumVecLoad, "Number of vector loads formed");
50STATISTIC(NumVecCmp, "Number of vector compares formed");
51STATISTIC(NumVecBO, "Number of vector binops formed");
52STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
53STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
54STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
55STATISTIC(NumScalarCmp, "Number of scalar compares formed");
56STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
57
59 "disable-vector-combine", cl::init(false), cl::Hidden,
60 cl::desc("Disable all vector combine transforms"));
61
63 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
64 cl::desc("Disable binop extract to shuffle transforms"));
65
67 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
68 cl::desc("Max number of instructions to scan for vector combining."));
69
70static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
71
72namespace {
73class VectorCombine {
74public:
75 VectorCombine(Function &F, const TargetTransformInfo &TTI,
76 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
77 const DataLayout *DL, TTI::TargetCostKind CostKind,
78 bool TryEarlyFoldsOnly)
79 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
80 DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
81 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
82
83 bool run();
84
85private:
86 Function &F;
87 IRBuilder<InstSimplifyFolder> Builder;
88 const TargetTransformInfo &TTI;
89 const DominatorTree &DT;
90 AAResults &AA;
91 AssumptionCache &AC;
92 const DataLayout *DL;
93 TTI::TargetCostKind CostKind;
94 const SimplifyQuery SQ;
95
96 /// If true, only perform beneficial early IR transforms. Do not introduce new
97 /// vector operations.
98 bool TryEarlyFoldsOnly;
99
100 InstructionWorklist Worklist;
101
102 /// Next instruction to iterate. It will be updated when it is erased by
103 /// RecursivelyDeleteTriviallyDeadInstructions.
104 Instruction *NextInst;
105
106 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
107 // parameter. That should be updated to specific sub-classes because the
108 // run loop was changed to dispatch on opcode.
109 bool vectorizeLoadInsert(Instruction &I);
110 bool widenSubvectorLoad(Instruction &I);
111 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
112 ExtractElementInst *Ext1,
113 unsigned PreferredExtractIndex) const;
114 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
115 const Instruction &I,
116 ExtractElementInst *&ConvertToShuffle,
117 unsigned PreferredExtractIndex);
118 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
119 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 bool foldExtractExtract(Instruction &I);
121 bool foldInsExtFNeg(Instruction &I);
122 bool foldInsExtBinop(Instruction &I);
123 bool foldInsExtVectorToShuffle(Instruction &I);
124 bool foldBitOpOfCastops(Instruction &I);
125 bool foldBitOpOfCastConstant(Instruction &I);
126 bool foldBitcastShuffle(Instruction &I);
127 bool scalarizeOpOrCmp(Instruction &I);
128 bool scalarizeVPIntrinsic(Instruction &I);
129 bool foldExtractedCmps(Instruction &I);
130 bool foldSelectsFromBitcast(Instruction &I);
131 bool foldBinopOfReductions(Instruction &I);
132 bool foldSingleElementStore(Instruction &I);
133 bool scalarizeLoad(Instruction &I);
134 bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
135 bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
136 bool scalarizeExtExtract(Instruction &I);
137 bool foldConcatOfBoolMasks(Instruction &I);
138 bool foldPermuteOfBinops(Instruction &I);
139 bool foldShuffleOfBinops(Instruction &I);
140 bool foldShuffleOfSelects(Instruction &I);
141 bool foldShuffleOfCastops(Instruction &I);
142 bool foldShuffleOfShuffles(Instruction &I);
143 bool foldPermuteOfIntrinsic(Instruction &I);
144 bool foldShufflesOfLengthChangingShuffles(Instruction &I);
145 bool foldShuffleOfIntrinsics(Instruction &I);
146 bool foldShuffleToIdentity(Instruction &I);
147 bool foldShuffleFromReductions(Instruction &I);
148 bool foldShuffleChainsToReduce(Instruction &I);
149 bool foldCastFromReductions(Instruction &I);
150 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
151 bool foldInterleaveIntrinsics(Instruction &I);
152 bool shrinkType(Instruction &I);
153 bool shrinkLoadForShuffles(Instruction &I);
154 bool shrinkPhiOfShuffles(Instruction &I);
155
156 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
157 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
158 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
159 Old.replaceAllUsesWith(&New);
160 if (auto *NewI = dyn_cast<Instruction>(&New)) {
161 New.takeName(&Old);
162 Worklist.pushUsersToWorkList(*NewI);
163 Worklist.pushValue(NewI);
164 }
165 if (Erase && isInstructionTriviallyDead(&Old)) {
166 eraseInstruction(Old);
167 } else {
168 Worklist.push(&Old);
169 }
170 }
171
172 void eraseInstruction(Instruction &I) {
173 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
174 SmallVector<Value *> Ops(I.operands());
175 Worklist.remove(&I);
176 I.eraseFromParent();
177
178 // Push remaining users of the operands and then the operand itself - allows
179 // further folds that were hindered by OneUse limits.
180 SmallPtrSet<Value *, 4> Visited;
181 for (Value *Op : Ops) {
182 if (!Visited.contains(Op)) {
183 if (auto *OpI = dyn_cast<Instruction>(Op)) {
184 if (RecursivelyDeleteTriviallyDeadInstructions(
185 OpI, nullptr, nullptr, [&](Value *V) {
186 if (auto *I = dyn_cast<Instruction>(V)) {
187 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
188 Worklist.remove(I);
189 if (I == NextInst)
190 NextInst = NextInst->getNextNode();
191 Visited.insert(I);
192 }
193 }))
194 continue;
195 Worklist.pushUsersToWorkList(*OpI);
196 Worklist.pushValue(OpI);
197 }
198 }
199 }
200 }
201};
202} // namespace
203
204/// Return the source operand of a potentially bitcasted value. If there is no
205/// bitcast, return the input value itself.
206static Value *peekThroughBitcasts(Value *V) {
207 while (auto *BitCast = dyn_cast<BitCastInst>(V))
208 V = BitCast->getOperand(0);
209 return V;
210}
211
212static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
213 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
214 // The widened load may load data from dirty regions or create data races
215 // non-existent in the source.
216 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
217 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
218 mustSuppressSpeculation(*Load))
219 return false;
220
221 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
222 // sure we have all of our type-based constraints in place for this target.
223 Type *ScalarTy = Load->getType()->getScalarType();
224 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
225 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
226 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
227 ScalarSize % 8 != 0)
228 return false;
229
230 return true;
231}
232
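// Illustrative IR for the load-insert widening below (hypothetical example,
// assuming a target with a 128-bit minimum vector register width and a
// sufficiently dereferenceable pointer):
//   %s = load float, ptr %p, align 4
//   %i = insertelement <4 x float> poison, float %s, i64 0
// -->
//   %wide = load <4 x float>, ptr %p, align 4
//   %i = shufflevector <4 x float> %wide, <4 x float> poison,
//                      <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>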
233bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
234 // Match insert into fixed vector of scalar value.
235 // TODO: Handle non-zero insert index.
236 Value *Scalar;
237 if (!match(&I,
238 m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
239 return false;
240
241 // Optionally match an extract from another vector.
242 Value *X;
243 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
244 if (!HasExtract)
245 X = Scalar;
246
247 auto *Load = dyn_cast<LoadInst>(X);
248 if (!canWidenLoad(Load, TTI))
249 return false;
250
251 Type *ScalarTy = Scalar->getType();
252 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
253 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
254
255 // Check safety of replacing the scalar load with a larger vector load.
256 // We use minimal alignment (maximum flexibility) because we only care about
257 // the dereferenceable region. When calculating cost and creating a new op,
258 // we may use a larger value based on alignment attributes.
259 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
260 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
261
262 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
263 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
264 unsigned OffsetEltIndex = 0;
265 Align Alignment = Load->getAlign();
266 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
267 &DT)) {
268 // It is not safe to load directly from the pointer, but we can still peek
269 // through gep offsets and check if it is safe to load from a base address with
270 // updated alignment. If it is, we can shuffle the element(s) into place
271 // after loading.
272 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
273 APInt Offset(OffsetBitWidth, 0);
274 SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
275
276 // We want to shuffle the result down from a high element of a vector, so
277 // the offset must be positive.
278 if (Offset.isNegative())
279 return false;
280
281 // The offset must be a multiple of the scalar element to shuffle cleanly
282 // in the element's size.
283 uint64_t ScalarSizeInBytes = ScalarSize / 8;
284 if (Offset.urem(ScalarSizeInBytes) != 0)
285 return false;
286
287 // If we load MinVecNumElts, will our target element still be loaded?
288 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
289 if (OffsetEltIndex >= MinVecNumElts)
290 return false;
291
292 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
293 &DT))
294 return false;
295
296 // Update alignment with offset value. Note that the offset could be negated
297 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
298 // negation does not change the result of the alignment calculation.
299 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
300 }
301
302 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
303 // Use the greater of the alignment on the load or its source pointer.
304 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
305 Type *LoadTy = Load->getType();
306 unsigned AS = Load->getPointerAddressSpace();
307 InstructionCost OldCost =
308 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
309 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
310 OldCost +=
311 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
312 /* Insert */ true, HasExtract, CostKind);
313
314 // New pattern: load VecPtr
315 InstructionCost NewCost =
316 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
317 // Optionally, we are shuffling the loaded vector element(s) into place.
318 // For the mask set everything but element 0 to undef to prevent poison from
319 // propagating from the extra loaded memory. This will also optionally
320 // shrink/grow the vector from the loaded size to the output size.
321 // We assume this operation has no cost in codegen if there was no offset.
322 // Note that we could use freeze to avoid poison problems, but then we might
323 // still need a shuffle to change the vector size.
324 auto *Ty = cast<FixedVectorType>(I.getType());
325 unsigned OutputNumElts = Ty->getNumElements();
326 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
327 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
328 Mask[0] = OffsetEltIndex;
329 if (OffsetEltIndex)
330 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
331 CostKind);
332
333 // We can aggressively convert to the vector form because the backend can
334 // invert this transform if it does not result in a performance win.
335 if (OldCost < NewCost || !NewCost.isValid())
336 return false;
337
338 // It is safe and potentially profitable to load a vector directly:
339 // inselt undef, load Scalar, 0 --> load VecPtr
340 IRBuilder<> Builder(Load);
341 Value *CastedPtr =
342 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
343 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
344 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
345
346 replaceValue(I, *VecLd);
347 ++NumVecLoad;
348 return true;
349}
350
351/// If we are loading a vector and then inserting it into a larger vector with
352/// undefined elements, try to load the larger vector and eliminate the insert.
353/// This removes a shuffle in IR and may allow combining of other loaded values.
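// Illustrative IR (hypothetical example, assuming the wider load is known to
// be dereferenceable and the cost model agrees):
//   %v = load <2 x float>, ptr %p, align 16
//   %r = shufflevector <2 x float> %v, <2 x float> poison,
//                      <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
// -->
//   %r = load <4 x float>, ptr %p, align 16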
354bool VectorCombine::widenSubvectorLoad(Instruction &I) {
355 // Match subvector insert of fixed vector.
356 auto *Shuf = cast<ShuffleVectorInst>(&I);
357 if (!Shuf->isIdentityWithPadding())
358 return false;
359
360 // Allow a non-canonical shuffle mask that is choosing elements from op1.
361 unsigned NumOpElts =
362 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
363 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
364 return M >= (int)(NumOpElts);
365 });
366
367 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
368 if (!canWidenLoad(Load, TTI))
369 return false;
370
371 // We use minimal alignment (maximum flexibility) because we only care about
372 // the dereferenceable region. When calculating cost and creating a new op,
373 // we may use a larger value based on alignment attributes.
374 auto *Ty = cast<FixedVectorType>(I.getType());
375 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
376 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
377 Align Alignment = Load->getAlign();
378 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
379 return false;
380
381 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
382 Type *LoadTy = Load->getType();
383 unsigned AS = Load->getPointerAddressSpace();
384
385 // Original pattern: insert_subvector (load PtrOp)
386 // This conservatively assumes that the cost of a subvector insert into an
387 // undef value is 0. We could add that cost if the cost model accurately
388 // reflects the real cost of that operation.
389 InstructionCost OldCost =
390 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
391
392 // New pattern: load PtrOp
393 InstructionCost NewCost =
394 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
395
396 // We can aggressively convert to the vector form because the backend can
397 // invert this transform if it does not result in a performance win.
398 if (OldCost < NewCost || !NewCost.isValid())
399 return false;
400
401 IRBuilder<> Builder(Load);
402 Value *CastedPtr =
403 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
404 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
405 replaceValue(I, *VecLd);
406 ++NumVecLoad;
407 return true;
408}
409
410/// Determine which, if any, of the inputs should be replaced by a shuffle
411/// followed by extract from a different index.
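// Illustrative example (hypothetical): for extracts from lanes 1 and 3 with
// equal extract costs and no preferred index, the extract from the higher
// lane (3) is returned, so its source is later shuffled down to lane 1.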
412ExtractElementInst *VectorCombine::getShuffleExtract(
413 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
414 unsigned PreferredExtractIndex = InvalidIndex) const {
415 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
416 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
417 assert(Index0C && Index1C && "Expected constant extract indexes");
418
419 unsigned Index0 = Index0C->getZExtValue();
420 unsigned Index1 = Index1C->getZExtValue();
421
422 // If the extract indexes are identical, no shuffle is needed.
423 if (Index0 == Index1)
424 return nullptr;
425
426 Type *VecTy = Ext0->getVectorOperand()->getType();
427 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
428 InstructionCost Cost0 =
429 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
430 InstructionCost Cost1 =
431 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
432
433 // If both costs are invalid, no shuffle is needed.
434 if (!Cost0.isValid() && !Cost1.isValid())
435 return nullptr;
436
437 // We are extracting from 2 different indexes, so one operand must be shuffled
438 // before performing a vector operation and/or extract. The more expensive
439 // extract will be replaced by a shuffle.
440 if (Cost0 > Cost1)
441 return Ext0;
442 if (Cost1 > Cost0)
443 return Ext1;
444
445 // If the costs are equal and there is a preferred extract index, shuffle the
446 // opposite operand.
447 if (PreferredExtractIndex == Index0)
448 return Ext1;
449 if (PreferredExtractIndex == Index1)
450 return Ext0;
451
452 // Otherwise, replace the extract with the higher index.
453 return Index0 > Index1 ? Ext0 : Ext1;
454}
455
456/// Compare the relative costs of 2 extracts followed by scalar operation vs.
457/// vector operation(s) followed by extract. Return true if the existing
458/// instructions are cheaper than a vector alternative. Otherwise, return false
459/// and if one of the extracts should be transformed to a shufflevector, set
460/// \p ConvertToShuffle to that extract instruction.
461bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
462 ExtractElementInst *Ext1,
463 const Instruction &I,
464 ExtractElementInst *&ConvertToShuffle,
465 unsigned PreferredExtractIndex) {
466 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
467 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
468 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
469
470 unsigned Opcode = I.getOpcode();
471 Value *Ext0Src = Ext0->getVectorOperand();
472 Value *Ext1Src = Ext1->getVectorOperand();
473 Type *ScalarTy = Ext0->getType();
474 auto *VecTy = cast<VectorType>(Ext0Src->getType());
475 InstructionCost ScalarOpCost, VectorOpCost;
476
477 // Get cost estimates for scalar and vector versions of the operation.
478 bool IsBinOp = Instruction::isBinaryOp(Opcode);
479 if (IsBinOp) {
480 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
481 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
482 } else {
483 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
484 "Expected a compare");
485 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
486 ScalarOpCost = TTI.getCmpSelInstrCost(
487 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
488 VectorOpCost = TTI.getCmpSelInstrCost(
489 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
490 }
491
492 // Get cost estimates for the extract elements. These costs will factor into
493 // both sequences.
494 unsigned Ext0Index = Ext0IndexC->getZExtValue();
495 unsigned Ext1Index = Ext1IndexC->getZExtValue();
496
497 InstructionCost Extract0Cost =
498 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
499 InstructionCost Extract1Cost =
500 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
501
502 // A more expensive extract will always be replaced by a splat shuffle.
503 // For example, if Ext0 is more expensive:
504 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
505 // extelt (opcode (splat V0, Ext0), V1), Ext1
506 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
507 // check the cost of creating a broadcast shuffle and shuffling both
508 // operands to element 0.
509 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
510 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
511 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
512
513 // Extra uses of the extracts mean that we include those costs in the
514 // vector total because those instructions will not be eliminated.
515 InstructionCost OldCost, NewCost;
516 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
517 // Handle a special case. If the 2 extracts are identical, adjust the
518 // formulas to account for that. The extra use charge allows for either the
519 // CSE'd pattern or an unoptimized form with identical values:
520 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
521 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
522 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
523 OldCost = CheapExtractCost + ScalarOpCost;
524 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
525 } else {
526 // Handle the general case. Each extract is actually a different value:
527 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
528 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
529 NewCost = VectorOpCost + CheapExtractCost +
530 !Ext0->hasOneUse() * Extract0Cost +
531 !Ext1->hasOneUse() * Extract1Cost;
532 }
533
534 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
535 if (ConvertToShuffle) {
536 if (IsBinOp && DisableBinopExtractShuffle)
537 return true;
538
539 // If we are extracting from 2 different indexes, then one operand must be
540 // shuffled before performing the vector operation. The shuffle mask is
541 // poison except for 1 lane that is being translated to the remaining
542 // extraction lane. Therefore, it is a splat shuffle. Ex:
543 // ShufMask = { poison, poison, 0, poison }
544 // TODO: The cost model has an option for a "broadcast" shuffle
545 // (splat-from-element-0), but no option for a more general splat.
546 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
547 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
548 PoisonMaskElem);
549 ShuffleMask[BestInsIndex] = BestExtIndex;
550 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
551 VecTy, VecTy, ShuffleMask, CostKind, 0,
552 nullptr, {ConvertToShuffle});
553 } else {
554 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
555 VecTy, VecTy, {}, CostKind, 0, nullptr,
556 {ConvertToShuffle});
557 }
558 }
559
560 // Aggressively form a vector op if the cost is equal because the transform
561 // may enable further optimization.
562 // Codegen can reverse this transform (scalarize) if it was not profitable.
563 return OldCost < NewCost;
564}
565
566/// Create a shuffle that translates (shifts) 1 element from the input vector
567/// to a new element location.
568static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
569 unsigned NewIndex, IRBuilderBase &Builder) {
570 // The shuffle mask is poison except for 1 lane that is being translated
571 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
572 // ShufMask = { 2, poison, poison, poison }
573 auto *VecTy = cast<FixedVectorType>(Vec->getType());
574 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
575 ShufMask[NewIndex] = OldIndex;
576 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
577}
578
579/// Given an extract element instruction with constant index operand, shuffle
580/// the source vector (shift the scalar element) to a NewIndex for extraction.
581/// Return null if the input can be constant folded, so that we are not creating
582/// unnecessary instructions.
583static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
584 IRBuilderBase &Builder) {
585 // Shufflevectors can only be created for fixed-width vectors.
586 Value *X = ExtElt->getVectorOperand();
587 if (!isa<FixedVectorType>(X->getType()))
588 return nullptr;
589
590 // If the extract can be constant-folded, this code is unsimplified. Defer
591 // to other passes to handle that.
592 Value *C = ExtElt->getIndexOperand();
593 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
594 if (isa<Constant>(X))
595 return nullptr;
596
597 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
598 NewIndex, Builder);
599 return Shuf;
600}
601
602/// Try to reduce extract element costs by converting scalar compares to vector
603/// compares followed by extract.
604/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
605Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
606 Instruction &I) {
607 assert(isa<CmpInst>(&I) && "Expected a compare");
608
609 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
610 // --> extelt (cmp Pred V0, V1), ExtIndex
611 ++NumVecCmp;
612 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
613 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
614 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
615}
616
617/// Try to reduce extract element costs by converting scalar binops to vector
618/// binops followed by extract.
619/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
620Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
621 Instruction &I) {
622 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
623
624 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
625 // --> extelt (bo V0, V1), ExtIndex
626 ++NumVecBO;
627 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
628 V1, "foldExtExtBinop");
629
630 // All IR flags are safe to back-propagate because any potential poison
631 // created in unused vector elements is discarded by the extract.
632 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
633 VecBOInst->copyIRFlags(&I);
634
635 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
636}
637
638/// Match an instruction with extracted vector operands.
639bool VectorCombine::foldExtractExtract(Instruction &I) {
640 // It is not safe to transform things like div, urem, etc. because we may
641 // create undefined behavior when executing those on unknown vector elements.
642 if (!isSafeToSpeculativelyExecute(&I))
643 return false;
644
645 Instruction *I0, *I1;
646 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
647 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
648 !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
649 return false;
650
651 Value *V0, *V1;
652 uint64_t C0, C1;
653 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
654 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
655 V0->getType() != V1->getType())
656 return false;
657
658 // If the scalar value 'I' is going to be re-inserted into a vector, then try
659 // to create an extract to that same element. The extract/insert can be
660 // reduced to a "select shuffle".
661 // TODO: If we add a larger pattern match that starts from an insert, this
662 // probably becomes unnecessary.
663 auto *Ext0 = cast<ExtractElementInst>(I0);
664 auto *Ext1 = cast<ExtractElementInst>(I1);
665 uint64_t InsertIndex = InvalidIndex;
666 if (I.hasOneUse())
667 match(I.user_back(),
668 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
669
670 ExtractElementInst *ExtractToChange;
671 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
672 return false;
673
674 Value *ExtOp0 = Ext0->getVectorOperand();
675 Value *ExtOp1 = Ext1->getVectorOperand();
676
677 if (ExtractToChange) {
678 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
679 Value *NewExtOp =
680 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
681 if (!NewExtOp)
682 return false;
683 if (ExtractToChange == Ext0)
684 ExtOp0 = NewExtOp;
685 else
686 ExtOp1 = NewExtOp;
687 }
688
689 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
690 : Ext0->getIndexOperand();
691 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
692 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
693 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
694 Worklist.push(Ext0);
695 Worklist.push(Ext1);
696 replaceValue(I, *NewExt);
697 return true;
698}
699
700/// Try to replace an extract + scalar fneg + insert with a vector fneg +
701/// shuffle.
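// Illustrative IR (hypothetical example, same-length vectors, same lane):
//   %e = extractelement <4 x float> %src, i64 2
//   %n = fneg float %e
//   %r = insertelement <4 x float> %dst, float %n, i64 2
// -->
//   %vn = fneg <4 x float> %src
//   %r = shufflevector <4 x float> %dst, <4 x float> %vn,
//                      <4 x i32> <i32 0, i32 1, i32 6, i32 3>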
702bool VectorCombine::foldInsExtFNeg(Instruction &I) {
703 // Match an insert (op (extract)) pattern.
704 Value *DstVec;
705 uint64_t ExtIdx, InsIdx;
706 Instruction *FNeg;
707 if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
708 m_ConstantInt(InsIdx))))
709 return false;
710
711 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
712 Value *SrcVec;
713 Instruction *Extract;
714 if (!match(FNeg, m_FNeg(m_CombineAnd(
715 m_Instruction(Extract),
716 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
717 return false;
718
719 auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
720 auto *DstVecScalarTy = DstVecTy->getScalarType();
721 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
722 if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
723 return false;
724
725 // Ignore if insert/extract index is out of bounds or destination vector has
726 // one element
727 unsigned NumDstElts = DstVecTy->getNumElements();
728 unsigned NumSrcElts = SrcVecTy->getNumElements();
729 if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
730 return false;
731
732 // We are inserting the negated element into the same lane that we extracted
733 // from. This is equivalent to a select-shuffle that chooses all but the
734 // negated element from the destination vector.
735 SmallVector<int> Mask(NumDstElts);
736 std::iota(Mask.begin(), Mask.end(), 0);
737 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
738 InstructionCost OldCost =
739 TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
740 TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
741
742 // If the extract has one use, it will be eliminated, so count it in the
743 // original cost. If it has more than one use, ignore the cost because it will
744 // be the same before/after.
745 if (Extract->hasOneUse())
746 OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
747
748 InstructionCost NewCost =
749 TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
750 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, DstVecTy,
751 DstVecTy, Mask, CostKind);
752
753 bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
754 // If the lengths of the two vectors are not equal,
755 // we need to add a length-change vector. Add this cost.
756 SmallVector<int> SrcMask;
757 if (NeedLenChg) {
758 SrcMask.assign(NumDstElts, PoisonMaskElem);
759 SrcMask[ExtIdx % NumDstElts] = ExtIdx;
760 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
761 DstVecTy, SrcVecTy, SrcMask, CostKind);
762 }
763
764 LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
765 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
766 << "\n");
767 if (NewCost > OldCost)
768 return false;
769
770 Value *NewShuf, *LenChgShuf = nullptr;
771 // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
772 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
773 if (NeedLenChg) {
774 // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
775 LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
776 NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
777 Worklist.pushValue(LenChgShuf);
778 } else {
779 // shuffle DstVec, (fneg SrcVec), Mask
780 NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
781 }
782
783 Worklist.pushValue(VecFNeg);
784 replaceValue(I, *NewShuf);
785 return true;
786}
787
788/// Try to fold insert(binop(x,y),binop(a,b),idx)
789/// --> binop(insert(x,a,idx),insert(y,b,idx))
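// Illustrative IR (hypothetical example):
//   %v = add <4 x i32> %x, %y
//   %s = add i32 %a, %b
//   %r = insertelement <4 x i32> %v, i32 %s, i64 1
// -->
//   %x1 = insertelement <4 x i32> %x, i32 %a, i64 1
//   %y1 = insertelement <4 x i32> %y, i32 %b, i64 1
//   %r = add <4 x i32> %x1, %y1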
790bool VectorCombine::foldInsExtBinop(Instruction &I) {
791 BinaryOperator *VecBinOp, *SclBinOp;
792 uint64_t Index;
793 if (!match(&I,
794 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
795 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
796 return false;
797
798 // TODO: Add support for addlike etc.
799 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
800 if (BinOpcode != SclBinOp->getOpcode())
801 return false;
802
803 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
804 if (!ResultTy)
805 return false;
806
807 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
808 // shuffle?
809
810 InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
811 TTI.getInstructionCost(VecBinOp, CostKind) +
812 TTI.getInstructionCost(SclBinOp, CostKind);
813 InstructionCost NewCost =
814 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
815 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
816 Index, VecBinOp->getOperand(0),
817 SclBinOp->getOperand(0)) +
818 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
819 Index, VecBinOp->getOperand(1),
820 SclBinOp->getOperand(1));
821
822 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
823 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
824 << "\n");
825 if (NewCost > OldCost)
826 return false;
827
828 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
829 SclBinOp->getOperand(0), Index);
830 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
831 SclBinOp->getOperand(1), Index);
832 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
833
834 // Intersect flags from the old binops.
835 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
836 NewInst->copyIRFlags(VecBinOp);
837 NewInst->andIRFlags(SclBinOp);
838 }
839
840 Worklist.pushValue(NewIns0);
841 Worklist.pushValue(NewIns1);
842 replaceValue(I, *NewBO);
843 return true;
844}
845
846/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
847/// Supports: bitcast, trunc, sext, zext
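// Illustrative IR (hypothetical example using zext):
//   %zx = zext <4 x i16> %x to <4 x i32>
//   %zy = zext <4 x i16> %y to <4 x i32>
//   %r = and <4 x i32> %zx, %zy
// -->
//   %a = and <4 x i16> %x, %y
//   %r = zext <4 x i16> %a to <4 x i32>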
848bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
849 // Check if this is a bitwise logic operation
850 auto *BinOp = dyn_cast<BinaryOperator>(&I);
851 if (!BinOp || !BinOp->isBitwiseLogicOp())
852 return false;
853
854 // Get the cast instructions
855 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
856 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
857 if (!LHSCast || !RHSCast) {
858 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
859 return false;
860 }
861
862 // Both casts must be the same type
863 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
864 if (CastOpcode != RHSCast->getOpcode())
865 return false;
866
867 // Only handle supported cast operations
868 switch (CastOpcode) {
869 case Instruction::BitCast:
870 case Instruction::Trunc:
871 case Instruction::SExt:
872 case Instruction::ZExt:
873 break;
874 default:
875 return false;
876 }
877
878 Value *LHSSrc = LHSCast->getOperand(0);
879 Value *RHSSrc = RHSCast->getOperand(0);
880
881 // Source types must match
882 if (LHSSrc->getType() != RHSSrc->getType())
883 return false;
884
885 auto *SrcTy = LHSSrc->getType();
886 auto *DstTy = I.getType();
887 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
888 // Other casts only handle vector types with integer elements.
889 if (CastOpcode != Instruction::BitCast &&
890 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
891 return false;
892
893 // Only integer scalar/vector values are legal for bitwise logic operations.
894 if (!SrcTy->getScalarType()->isIntegerTy() ||
895 !DstTy->getScalarType()->isIntegerTy())
896 return false;
897
898 // Cost Check :
899 // OldCost = bitlogic + 2*casts
900 // NewCost = bitlogic + cast
901
902 // Calculate specific costs for each cast with instruction context
903 InstructionCost LHSCastCost = TTI.getCastInstrCost(
904 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
905 InstructionCost RHSCastCost = TTI.getCastInstrCost(
906 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
907
908 InstructionCost OldCost =
909 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
910 LHSCastCost + RHSCastCost;
911
912 // For new cost, we can't provide an instruction (it doesn't exist yet)
913 InstructionCost GenericCastCost = TTI.getCastInstrCost(
914 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
915
916 InstructionCost NewCost =
917 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
918 GenericCastCost;
919
920 // Account for multi-use casts using specific costs
921 if (!LHSCast->hasOneUse())
922 NewCost += LHSCastCost;
923 if (!RHSCast->hasOneUse())
924 NewCost += RHSCastCost;
925
926 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
927 << " NewCost=" << NewCost << "\n");
928
929 if (NewCost > OldCost)
930 return false;
931
932 // Create the operation on the source type
933 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
934 BinOp->getName() + ".inner");
935 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
936 NewBinOp->copyIRFlags(BinOp);
937
938 Worklist.pushValue(NewOp);
939
940 // Create the cast operation directly to ensure we get a new instruction
941 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
942
943 // Preserve cast instruction flags
944 NewCast->copyIRFlags(LHSCast);
945 NewCast->andIRFlags(RHSCast);
946
947 // Insert the new instruction
948 Value *Result = Builder.Insert(NewCast);
949
950 replaceValue(I, *Result);
951 return true;
952}
953
954/// Match:
955// bitop(castop(x), C) ->
956// bitop(castop(x), castop(InvC)) ->
957// castop(bitop(x, InvC))
958// Supports: bitcast, zext, sext, trunc
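// Illustrative IR (hypothetical example using zext; InvC is the narrow
// constant whose cast reproduces C):
//   %zx = zext <4 x i8> %x to <4 x i32>
//   %r = and <4 x i32> %zx, <i32 15, i32 15, i32 15, i32 15>
// -->
//   %a = and <4 x i8> %x, <i8 15, i8 15, i8 15, i8 15>
//   %r = zext <4 x i8> %a to <4 x i32>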
959bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
960 Value *LHS;
961 Constant *C;
962
963 // Check if this is a bitwise logic operation
964 if (!match(&I, m_c_BitwiseLogic(m_Value(LHS), m_Constant(C))))
965 return false;
966
967 // Get the cast instructions
968 auto *LHSCast = dyn_cast<CastInst>(LHS);
969 if (!LHSCast)
970 return false;
971
972 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
973
974 // Only handle supported cast operations
975 switch (CastOpcode) {
976 case Instruction::BitCast:
977 case Instruction::ZExt:
978 case Instruction::SExt:
979 case Instruction::Trunc:
980 break;
981 default:
982 return false;
983 }
984
985 Value *LHSSrc = LHSCast->getOperand(0);
986
987 auto *SrcTy = LHSSrc->getType();
988 auto *DstTy = I.getType();
989 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
990 // Other casts only handle vector types with integer elements.
991 if (CastOpcode != Instruction::BitCast &&
992 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
993 return false;
994
995 // Only integer scalar/vector values are legal for bitwise logic operations.
996 if (!SrcTy->getScalarType()->isIntegerTy() ||
997 !DstTy->getScalarType()->isIntegerTy())
998 return false;
999
1000 // Find the constant InvC, such that castop(InvC) equals to C.
1001 PreservedCastFlags RHSFlags;
1002 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
1003 if (!InvC)
1004 return false;
1005
1006 // Cost Check :
1007 // OldCost = bitlogic + cast
1008 // NewCost = bitlogic + cast
1009
1010 // Calculate specific costs for each cast with instruction context
1011 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1012 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1013
1014 InstructionCost OldCost =
1015 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1016
1017 // For new cost, we can't provide an instruction (it doesn't exist yet)
1018 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1019 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1020
1021 InstructionCost NewCost =
1022 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1023 GenericCastCost;
1024
1025 // Account for multi-use casts using specific costs
1026 if (!LHSCast->hasOneUse())
1027 NewCost += LHSCastCost;
1028
1029 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1030 << " NewCost=" << NewCost << "\n");
1031
1032 if (NewCost > OldCost)
1033 return false;
1034
1035 // Create the operation on the source type
1036 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1037 LHSSrc, InvC, I.getName() + ".inner");
1038 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1039 NewBinOp->copyIRFlags(&I);
1040
1041 Worklist.pushValue(NewOp);
1042
1043 // Create the cast operation directly to ensure we get a new instruction
1044 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1045
1046 // Preserve cast instruction flags
1047 if (RHSFlags.NNeg)
1048 NewCast->setNonNeg();
1049 if (RHSFlags.NUW)
1050 NewCast->setHasNoUnsignedWrap();
1051 if (RHSFlags.NSW)
1052 NewCast->setHasNoSignedWrap();
1053
1054 NewCast->andIRFlags(LHSCast);
1055
1056 // Insert the new instruction
1057 Value *Result = Builder.Insert(NewCast);
1058
1059 replaceValue(I, *Result);
1060 return true;
1061}
1062
1063/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1064/// destination type followed by shuffle. This can enable further transforms by
1065/// moving bitcasts or shuffles together.
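// Illustrative IR (hypothetical example widening i16 elements to i32):
//   %s = shufflevector <8 x i16> %v, <8 x i16> poison,
//        <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
//   %r = bitcast <8 x i16> %s to <4 x i32>
// -->
//   %b = bitcast <8 x i16> %v to <4 x i32>
//   %r = shufflevector <4 x i32> %b, <4 x i32> poison,
//                      <4 x i32> <i32 2, i32 3, i32 0, i32 1>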
1066bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1067 Value *V0, *V1;
1068 ArrayRef<int> Mask;
1069 if (!match(&I, m_BitCast(m_OneUse(
1070 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1071 return false;
1072
1073 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1074 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1075 // mask for scalable type is a splat or not.
1076 // 2) Disallow non-vector casts.
1077 // TODO: We could allow any shuffle.
1078 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1079 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1080 if (!DestTy || !SrcTy)
1081 return false;
1082
1083 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1084 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1085 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1086 return false;
1087
1088 bool IsUnary = isa<UndefValue>(V1);
1089
1090 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1091 // if it won't increase the number of bitcasts.
1092 if (!IsUnary) {
1093 auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
1094 auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
1095 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1096 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1097 return false;
1098 }
1099
1100 SmallVector<int, 16> NewMask;
1101 if (DestEltSize <= SrcEltSize) {
1102 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1103 // always be expanded to the equivalent form choosing narrower elements.
1104 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
1105 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1106 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1107 } else {
1108 // The bitcast is from narrow elements to wide elements. The shuffle mask
1109 // must choose consecutive elements to allow casting first.
1110 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
1111 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1112 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1113 return false;
1114 }
1115
1116 // Bitcast the shuffle src - keep its original width but using the destination
1117 // scalar type.
1118 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1119 auto *NewShuffleTy =
1120 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1121 auto *OldShuffleTy =
1122 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1123 unsigned NumOps = IsUnary ? 1 : 2;
1124
1125 // The new shuffle must not cost more than the old shuffle.
1126 TargetTransformInfo::ShuffleKind SK =
1127 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
1128 : TargetTransformInfo::SK_PermuteTwoSrc;
1129
1130 InstructionCost NewCost =
1131 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1132 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1133 TargetTransformInfo::CastContextHint::None,
1134 CostKind));
1135 InstructionCost OldCost =
1136 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1137 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1138 TargetTransformInfo::CastContextHint::None,
1139 CostKind);
1140
1141 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1142 << OldCost << " vs NewCost: " << NewCost << "\n");
1143
1144 if (NewCost > OldCost || !NewCost.isValid())
1145 return false;
1146
1147 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1148 ++NumShufOfBitcast;
1149 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1150 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1151 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1152 replaceValue(I, *Shuf);
1153 return true;
1154}
1155
1156/// VP Intrinsics whose vector operands are both splat values may be simplified
1157/// into the scalar version of the operation and the result splatted. This
1158/// can lead to scalarization down the line.
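// Illustrative IR (hypothetical example with all lanes active; %x.splat and
// %y.splat are assumed splats of the scalars %x and %y):
//   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %x.splat, <4 x i32> %y.splat,
//            <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
// -->
//   %s = add i32 %x, %y
//   %s.ins = insertelement <4 x i32> poison, i32 %s, i64 0
//   %r = shufflevector <4 x i32> %s.ins, <4 x i32> poison, <4 x i32> zeroinitializer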
1159bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1160 if (!isa<VPIntrinsic>(I))
1161 return false;
1162 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1163 Value *Op0 = VPI.getArgOperand(0);
1164 Value *Op1 = VPI.getArgOperand(1);
1165
1166 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1167 return false;
1168
1169 // Check getSplatValue early in this function, to avoid doing unnecessary
1170 // work.
1171 Value *ScalarOp0 = getSplatValue(Op0);
1172 Value *ScalarOp1 = getSplatValue(Op1);
1173 if (!ScalarOp0 || !ScalarOp1)
1174 return false;
1175
1176 // For the binary VP intrinsics supported here, the result on disabled lanes
1177 // is a poison value. For now, only do this simplification if all lanes
1178 // are active.
1179 // TODO: Relax the condition that all lanes are active by using insertelement
1180 // on inactive lanes.
1181 auto IsAllTrueMask = [](Value *MaskVal) {
1182 if (Value *SplattedVal = getSplatValue(MaskVal))
1183 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1184 return ConstValue->isAllOnesValue();
1185 return false;
1186 };
1187 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1188 return false;
1189
1190 // Check to make sure we support scalarization of the intrinsic
1191 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1192 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1193 return false;
1194
1195 // Calculate cost of splatting both operands into vectors and the vector
1196 // intrinsic
1197 VectorType *VecTy = cast<VectorType>(VPI.getType());
1198 SmallVector<int> Mask;
1199 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1200 Mask.resize(FVTy->getNumElements(), 0);
1201 InstructionCost SplatCost =
1202 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
1203 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, Mask,
1204 CostKind);
1205
1206 // Calculate the cost of the VP Intrinsic
1207 SmallVector<Type *> Args;
1208 for (Value *V : VPI.args())
1209 Args.push_back(V->getType());
1210 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1211 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1212 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1213
1214 // Determine scalar opcode
1215 std::optional<unsigned> FunctionalOpcode =
1216 VPI.getFunctionalOpcode();
1217 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1218 if (!FunctionalOpcode) {
1219 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1220 if (!ScalarIntrID)
1221 return false;
1222 }
1223
1224 // Calculate cost of scalarizing
1225 InstructionCost ScalarOpCost = 0;
1226 if (ScalarIntrID) {
1227 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1228 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1229 } else {
1230 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1231 VecTy->getScalarType(), CostKind);
1232 }
1233
1234 // The existing splats may be kept around if other instructions use them.
1235 InstructionCost CostToKeepSplats =
1236 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1237 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1238
1239 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1240 << "\n");
1241 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1242 << ", Cost of scalarizing:" << NewCost << "\n");
1243
1244 // We want to scalarize unless the vector variant actually has lower cost.
1245 if (OldCost < NewCost || !NewCost.isValid())
1246 return false;
1247
1248 // Scalarize the intrinsic
1249 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1250 Value *EVL = VPI.getArgOperand(3);
1251
1252 // If the VP op might introduce UB or poison, we can scalarize it provided
1253 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1254 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1255 // scalarizing it.
1256 bool SafeToSpeculate;
1257 if (ScalarIntrID)
1258 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1259 .hasAttribute(Attribute::AttrKind::Speculatable);
1260 else
1261 SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
1262 *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
1263 if (!SafeToSpeculate &&
1264 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
1265 return false;
1266
1267 Value *ScalarVal =
1268 ScalarIntrID
1269 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1270 {ScalarOp0, ScalarOp1})
1271 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1272 ScalarOp0, ScalarOp1);
1273
1274 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1275 return true;
1276}
1277
1278/// Match a vector op/compare/intrinsic with at least one
1279/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1280/// by insertelement.
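// Illustrative IR (hypothetical example; the constant lanes fold into the new
// base vector):
//   %v = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %x, i64 0
//   %r = add <4 x i32> %v, <i32 7, i32 7, i32 7, i32 7>
// -->
//   %s = add i32 %x, 7
//   %r = insertelement <4 x i32> <i32 7, i32 8, i32 9, i32 10>, i32 %s, i64 0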
1281bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1282 auto *UO = dyn_cast<UnaryOperator>(&I);
1283 auto *BO = dyn_cast<BinaryOperator>(&I);
1284 auto *CI = dyn_cast<CmpInst>(&I);
1285 auto *II = dyn_cast<IntrinsicInst>(&I);
1286 if (!UO && !BO && !CI && !II)
1287 return false;
1288
1289 // TODO: Allow intrinsics with different argument types
1290 if (II) {
1291 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1292 return false;
1293 for (auto [Idx, Arg] : enumerate(II->args()))
1294 if (Arg->getType() != II->getType() &&
1295 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1296 return false;
1297 }
1298
1299 // Do not convert the vector condition of a vector select into a scalar
1300 // condition. That may cause problems for codegen because of differences in
1301 // boolean formats and register-file transfers.
1302 // TODO: Can we account for that in the cost model?
1303 if (CI)
1304 for (User *U : I.users())
1305 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1306 return false;
1307
1308 // Match constant vectors or scalars being inserted into constant vectors:
1309 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1310 SmallVector<Value *> VecCs, ScalarOps;
1311 std::optional<uint64_t> Index;
1312
1313 auto Ops = II ? II->args() : I.operands();
1314 for (auto [OpNum, Op] : enumerate(Ops)) {
1315 Constant *VecC;
1316 Value *V;
1317 uint64_t InsIdx = 0;
1318 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1319 m_ConstantInt(InsIdx)))) {
1320 // Bail if any inserts are out of bounds.
1321 VectorType *OpTy = cast<VectorType>(Op->getType());
1322 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1323 return false;
1324 // All inserts must have the same index.
1325 // TODO: Deal with mismatched index constants and variable indexes?
1326 if (!Index)
1327 Index = InsIdx;
1328 else if (InsIdx != *Index)
1329 return false;
1330 VecCs.push_back(VecC);
1331 ScalarOps.push_back(V);
1332 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1333 OpNum, &TTI)) {
1334 VecCs.push_back(Op.get());
1335 ScalarOps.push_back(Op.get());
1336 } else if (match(Op.get(), m_Constant(VecC))) {
1337 VecCs.push_back(VecC);
1338 ScalarOps.push_back(nullptr);
1339 } else {
1340 return false;
1341 }
1342 }
1343
1344 // Bail if all operands are constant.
1345 if (!Index.has_value())
1346 return false;
1347
1348 VectorType *VecTy = cast<VectorType>(I.getType());
1349 Type *ScalarTy = VecTy->getScalarType();
1350 assert(VecTy->isVectorTy() &&
1351 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1352 ScalarTy->isPointerTy()) &&
1353 "Unexpected types for insert element into binop or cmp");
1354
1355 unsigned Opcode = I.getOpcode();
1356 InstructionCost ScalarOpCost, VectorOpCost;
1357 if (CI) {
1358 CmpInst::Predicate Pred = CI->getPredicate();
1359 ScalarOpCost = TTI.getCmpSelInstrCost(
1360 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1361 VectorOpCost = TTI.getCmpSelInstrCost(
1362 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1363 } else if (UO || BO) {
1364 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1365 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1366 } else {
1367 IntrinsicCostAttributes ScalarICA(
1368 II->getIntrinsicID(), ScalarTy,
1369 SmallVector<Type *>(II->arg_size(), ScalarTy));
1370 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1371 IntrinsicCostAttributes VectorICA(
1372 II->getIntrinsicID(), VecTy,
1373 SmallVector<Type *>(II->arg_size(), VecTy));
1374 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1375 }
1376
1377 // Fold the vector constants in the original vectors into a new base vector to
1378 // get more accurate cost modelling.
1379 Value *NewVecC = nullptr;
1380 if (CI)
1381 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1382 else if (UO)
1383 NewVecC =
1384 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1385 else if (BO)
1386 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1387 else if (II)
1388 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1389
1390 if (!NewVecC)
1391 return false;
1392
1393 // Get cost estimate for the insert element. This cost will factor into
1394 // both sequences.
1395 InstructionCost OldCost = VectorOpCost;
1396 InstructionCost NewCost =
1397 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1398 CostKind, *Index, NewVecC);
1399
1400 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1401 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1402 II->getIntrinsicID(), Idx, &TTI)))
1403 continue;
1404 InstructionCost InsertCost = TTI.getVectorInstrCost(
1405 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1406 OldCost += InsertCost;
1407 NewCost += !Op->hasOneUse() * InsertCost;
1408 }
1409
1410 // We want to scalarize unless the vector variant actually has lower cost.
1411 if (OldCost < NewCost || !NewCost.isValid())
1412 return false;
1413
1414 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1415 // inselt NewVecC, (scalar_op V0, V1), Index
1416 if (CI)
1417 ++NumScalarCmp;
1418 else if (UO || BO)
1419 ++NumScalarOps;
1420 else
1421 ++NumScalarIntrinsic;
1422
1423 // For constant cases, extract the scalar element, this should constant fold.
1424 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1425 if (!Scalar)
1426 ScalarOps[OpIdx] = Builder.CreateExtractElement(
1427 cast<Constant>(VecC), Builder.getInt64(*Index));
1428
1429 Value *Scalar;
1430 if (CI)
1431 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1432 else if (UO || BO)
1433 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1434 else
1435 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1436
1437 Scalar->setName(I.getName() + ".scalar");
1438
1439 // All IR flags are safe to back-propagate. There is no potential for extra
1440 // poison to be created by the scalar instruction.
1441 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1442 ScalarInst->copyIRFlags(&I);
1443
1444 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1445 replaceValue(I, *Insert);
1446 return true;
1447}
1448
1449/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1450/// a vector into vector operations followed by extract. Note: The SLP pass
1451/// may miss this pattern because of implementation problems.
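// Illustrative IR (hypothetical example; which operand is shuffled depends on
// the relative extract costs):
//   %e0 = extractelement <4 x i32> %x, i64 0
//   %e1 = extractelement <4 x i32> %x, i64 3
//   %c0 = icmp sgt i32 %e0, 10
//   %c1 = icmp sgt i32 %e1, 20
//   %r = and i1 %c0, %c1
// -->
//   %vcmp = icmp sgt <4 x i32> %x, <i32 10, i32 poison, i32 poison, i32 20>
//   %shift = shufflevector <4 x i1> %vcmp, <4 x i1> poison,
//            <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
//   %vand = and <4 x i1> %vcmp, %shift
//   %r = extractelement <4 x i1> %vand, i64 0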
1452bool VectorCombine::foldExtractedCmps(Instruction &I) {
1453 auto *BI = dyn_cast<BinaryOperator>(&I);
1454
1455 // We are looking for a scalar binop of booleans.
1456 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1457 if (!BI || !I.getType()->isIntegerTy(1))
1458 return false;
1459
1460 // The compare predicates should match, and each compare should have a
1461 // constant operand.
1462 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1463 Instruction *I0, *I1;
1464 Constant *C0, *C1;
1465 CmpPredicate P0, P1;
1466 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1467 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1468 return false;
1469
1470 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1471 if (!MatchingPred)
1472 return false;
1473
1474 // The compare operands must be extracts of the same vector with constant
1475 // extract indexes.
1476 Value *X;
1477 uint64_t Index0, Index1;
1478 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1479 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1480 return false;
1481
1482 auto *Ext0 = cast<ExtractElementInst>(I0);
1483 auto *Ext1 = cast<ExtractElementInst>(I1);
1484 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1485 if (!ConvertToShuf)
1486 return false;
1487 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1488 "Unknown ExtractElementInst");
1489
1490 // The original scalar pattern is:
1491 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1492 CmpInst::Predicate Pred = *MatchingPred;
1493 unsigned CmpOpcode =
1494 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1495 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1496 if (!VecTy)
1497 return false;
1498
1499 InstructionCost Ext0Cost =
1500 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1501 InstructionCost Ext1Cost =
1502 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
1503 InstructionCost CmpCost = TTI.getCmpSelInstrCost(
1504 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1505 CostKind);
1506
1507 InstructionCost OldCost =
1508 Ext0Cost + Ext1Cost + CmpCost * 2 +
1509 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1510
1511 // The proposed vector pattern is:
1512 // vcmp = cmp Pred X, VecC
1513 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1514 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1515 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1516 auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecTy));
1517 InstructionCost NewCost = TTI.getCmpSelInstrCost(
1518 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1519 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1520 ShufMask[CheapIndex] = ExpensiveIndex;
1521 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
1522 CmpTy, ShufMask, CostKind);
1523 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1524 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1525 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1526 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1527
1528 // Aggressively form vector ops if the cost is equal because the transform
1529 // may enable further optimization.
1530 // Codegen can reverse this transform (scalarize) if it was not profitable.
1531 if (OldCost < NewCost || !NewCost.isValid())
1532 return false;
1533
1534 // Create a vector constant from the 2 scalar constants.
1535 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1536 PoisonValue::get(VecTy->getElementType()));
1537 CmpC[Index0] = C0;
1538 CmpC[Index1] = C1;
1539 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1540 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1541 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1542 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1543 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1544 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1545 replaceValue(I, *NewExt);
1546 ++NumVecCmpBO;
1547 return true;
1548}
1549
1550/// Try to fold scalar selects that select between extracted elements and zero
1551/// into extracting from a vector select. This is rooted at the bitcast.
1552///
1553/// This pattern arises when a vector is bitcast to a smaller element type,
1554/// elements are extracted, and then conditionally selected with zero:
1555///
1556/// %bc = bitcast <4 x i32> %src to <16 x i8>
1557/// %e0 = extractelement <16 x i8> %bc, i32 0
1558/// %s0 = select i1 %cond, i8 %e0, i8 0
1559/// %e1 = extractelement <16 x i8> %bc, i32 1
1560/// %s1 = select i1 %cond, i8 %e1, i8 0
1561/// ...
1562///
1563/// Transforms to:
1564/// %sel = select i1 %cond, <4 x i32> %src, <4 x i32> zeroinitializer
1565/// %bc = bitcast <4 x i32> %sel to <16 x i8>
1566/// %e0 = extractelement <16 x i8> %bc, i32 0
1567/// %e1 = extractelement <16 x i8> %bc, i32 1
1568/// ...
1569///
1570/// This is profitable because vector select on wider types produces fewer
1571/// select/cndmask instructions than scalar selects on each element.
1572bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
1573 auto *BC = dyn_cast<BitCastInst>(&I);
1574 if (!BC)
1575 return false;
1576
1577 FixedVectorType *SrcVecTy = dyn_cast<FixedVectorType>(BC->getSrcTy());
1578 FixedVectorType *DstVecTy = dyn_cast<FixedVectorType>(BC->getDestTy());
1579 if (!SrcVecTy || !DstVecTy)
1580 return false;
1581
1582 // Source must be 32-bit or 64-bit elements, destination must be smaller
1583 // integer elements. Zero in all these types is all-bits-zero.
1584 Type *SrcEltTy = SrcVecTy->getElementType();
1585 Type *DstEltTy = DstVecTy->getElementType();
1586 unsigned SrcEltBits = SrcEltTy->getPrimitiveSizeInBits();
1587 unsigned DstEltBits = DstEltTy->getPrimitiveSizeInBits();
1588
1589 if (SrcEltBits != 32 && SrcEltBits != 64)
1590 return false;
1591
1592 if (!DstEltTy->isIntegerTy() || DstEltBits >= SrcEltBits)
1593 return false;
1594
1595 // Check profitability using TTI before collecting users.
1596 Type *CondTy = CmpInst::makeCmpResultType(DstEltTy);
1597 Type *VecCondTy = CmpInst::makeCmpResultType(SrcVecTy);
1598
1599 InstructionCost ScalarSelCost =
1600 TTI.getCmpSelInstrCost(Instruction::Select, DstEltTy, CondTy,
1601 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1602 InstructionCost VecSelCost =
1603 TTI.getCmpSelInstrCost(Instruction::Select, SrcVecTy, VecCondTy,
1604 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1605
1606 // We need at least this many selects for vectorization to be profitable.
1607 // VecSelCost < ScalarSelCost * NumSelects => NumSelects > VecSelCost /
1608 // ScalarSelCost
1609 if (!ScalarSelCost.isValid() || ScalarSelCost == 0)
1610 return false;
1611
1612 unsigned MinSelects = (VecSelCost.getValue() / ScalarSelCost.getValue()) + 1;
1613
1614 // Quick check: if bitcast doesn't have enough users, bail early.
1615 if (!BC->hasNUsesOrMore(MinSelects))
1616 return false;
1617
1618 // Collect all select users that match the pattern, grouped by condition.
1619 // Pattern: select i1 %cond, (extractelement %bc, idx), 0
1620 DenseMap<Value *, SmallVector<SelectInst *, 8>> CondToSelects;
1621
1622 for (User *U : BC->users()) {
1623 auto *Ext = dyn_cast<ExtractElementInst>(U);
1624 if (!Ext)
1625 continue;
1626
1627 for (User *ExtUser : Ext->users()) {
1628 Value *Cond;
1629 // Match: select i1 %cond, %ext, 0
1630 if (match(ExtUser, m_Select(m_Value(Cond), m_Specific(Ext), m_Zero())) &&
1631 Cond->getType()->isIntegerTy(1))
1632 CondToSelects[Cond].push_back(cast<SelectInst>(ExtUser));
1633 }
1634 }
1635
1636 if (CondToSelects.empty())
1637 return false;
1638
1639 bool MadeChange = false;
1640 Value *SrcVec = BC->getOperand(0);
1641
1642 // Process each group of selects with the same condition.
1643 for (auto [Cond, Selects] : CondToSelects) {
1644 // Only profitable if vector select cost < total scalar select cost.
1645 if (Selects.size() < MinSelects) {
1646 LLVM_DEBUG(dbgs() << "VectorCombine: foldSelectsFromBitcast not "
1647 << "profitable (VecCost=" << VecSelCost
1648 << ", ScalarCost=" << ScalarSelCost
1649 << ", NumSelects=" << Selects.size() << ")\n");
1650 continue;
1651 }
1652
1653 // Create the vector select and bitcast once for this condition.
1654 Builder.SetInsertPoint(BC->getNextNode());
1655 Value *VecSel =
1656 Builder.CreateSelect(Cond, SrcVec, Constant::getNullValue(SrcVecTy));
1657 Value *NewBC = Builder.CreateBitCast(VecSel, DstVecTy);
1658
1659 // Replace each scalar select with an extract from the new bitcast.
1660 for (SelectInst *Sel : Selects) {
1661 auto *Ext = cast<ExtractElementInst>(Sel->getTrueValue());
1662 Value *Idx = Ext->getIndexOperand();
1663
1664 Builder.SetInsertPoint(Sel);
1665 Value *NewExt = Builder.CreateExtractElement(NewBC, Idx);
1666 replaceValue(*Sel, *NewExt);
1667 MadeChange = true;
1668 }
1669
1670 LLVM_DEBUG(dbgs() << "VectorCombine: folded " << Selects.size()
1671 << " selects into vector select\n");
1672 }
1673
1674 return MadeChange;
1675}
1676
1677static void analyzeCostOfVecReduction(const IntrinsicInst &II,
1678 TTI::TargetCostKind CostKind,
1679 const TargetTransformInfo &TTI,
1680 InstructionCost &CostBeforeReduction,
1681 InstructionCost &CostAfterReduction) {
1682 Instruction *Op0, *Op1;
1683 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1684 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1685 unsigned ReductionOpc =
1686 getArithmeticReductionInstruction(II.getIntrinsicID());
1687 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1688 bool IsUnsigned = isa<ZExtInst>(RedOp);
1689 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1690
1691 CostBeforeReduction =
1692 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1693 TTI::CastContextHint::None, CostKind);
1694 CostAfterReduction =
1695 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1696 ExtType, FastMathFlags(), CostKind);
1697 return;
1698 }
1699 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1700 match(RedOp,
1702 match(Op0, m_ZExtOrSExt(m_Value())) &&
1703 Op0->getOpcode() == Op1->getOpcode() &&
1704 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1705 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1706 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1707 bool IsUnsigned = isa<ZExtInst>(Op0);
1708 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1709 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1710
1711 InstructionCost ExtCost =
1712 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1713 TTI::CastContextHint::None, CostKind);
1714 InstructionCost MulCost =
1715 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1716 InstructionCost Ext2Cost =
1717 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1718 TTI::CastContextHint::None, CostKind);
1719
1720 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1721 CostAfterReduction = TTI.getMulAccReductionCost(
1722 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1723 return;
1724 }
1725 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1726 std::nullopt, CostKind);
1727}
1728
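/// Try to fold a binary op of two one-use reductions over same-typed vectors
/// into one reduction of a vector binary op, e.g. (illustrative sketch):
///   add (llvm.vector.reduce.add(%a), llvm.vector.reduce.add(%b))
///     --> llvm.vector.reduce.add(add %a, %b)
/// A sub of two add-reductions becomes an add-reduction of the vector sub.
/// The rewrite only fires when the target cost model considers it cheaper.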
1729bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1730 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1731 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1732 if (BinOpOpc == Instruction::Sub)
1733 ReductionIID = Intrinsic::vector_reduce_add;
1734 if (ReductionIID == Intrinsic::not_intrinsic)
1735 return false;
1736
1737 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1738 Intrinsic::ID IID) -> Value * {
1739 auto *II = dyn_cast<IntrinsicInst>(V);
1740 if (!II)
1741 return nullptr;
1742 if (II->getIntrinsicID() == IID && II->hasOneUse())
1743 return II->getArgOperand(0);
1744 return nullptr;
1745 };
1746
1747 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1748 if (!V0)
1749 return false;
1750 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1751 if (!V1)
1752 return false;
1753
1754 auto *VTy = cast<VectorType>(V0->getType());
1755 if (V1->getType() != VTy)
1756 return false;
1757 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1758 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1759 unsigned ReductionOpc =
1760 getArithmeticReductionInstruction(II0.getIntrinsicID());
1761
1762 InstructionCost OldCost = 0;
1763 InstructionCost NewCost = 0;
1764 InstructionCost CostOfRedOperand0 = 0;
1765 InstructionCost CostOfRed0 = 0;
1766 InstructionCost CostOfRedOperand1 = 0;
1767 InstructionCost CostOfRed1 = 0;
1768 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1769 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1770 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1771 NewCost =
1772 CostOfRedOperand0 + CostOfRedOperand1 +
1773 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1774 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1775 if (NewCost >= OldCost || !NewCost.isValid())
1776 return false;
1777
1778 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1779 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1780 << "\n");
1781 Value *VectorBO;
1782 if (BinOpOpc == Instruction::Or)
1783 VectorBO = Builder.CreateOr(V0, V1, "",
1784 cast<PossiblyDisjointInst>(I).isDisjoint());
1785 else
1786 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1787
1788 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1789 replaceValue(I, *Rdx);
1790 return true;
1791}
1792
1793// Check if memory loc modified between two instrs in the same BB
1794static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1795 BasicBlock::iterator End,
1796 const MemoryLocation &Loc, AAResults &AA) {
1797 unsigned NumScanned = 0;
1798 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1799 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1800 ++NumScanned > MaxInstrsToScan;
1801 });
1802}
1803
1804namespace {
1805/// Helper class to indicate whether a vector index can be safely scalarized and
1806/// if a freeze needs to be inserted.
1807class ScalarizationResult {
1808 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1809
1810 StatusTy Status;
1811 Value *ToFreeze;
1812
1813 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1814 : Status(Status), ToFreeze(ToFreeze) {}
1815
1816public:
1817 ScalarizationResult(const ScalarizationResult &Other) = default;
1818 ~ScalarizationResult() {
1819 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1820 }
1821
1822 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1823 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1824 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1825 return {StatusTy::SafeWithFreeze, ToFreeze};
1826 }
1827
1828 /// Returns true if the index can be scalarized without requiring a freeze.
1829 bool isSafe() const { return Status == StatusTy::Safe; }
1830 /// Returns true if the index cannot be scalarized.
1831 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1832 /// Returns true if the index can be scalarized, but requires inserting a
1833 /// freeze.
1834 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1835
1836 /// Reset the state to Unsafe and clear ToFreeze if set.
1837 void discard() {
1838 ToFreeze = nullptr;
1839 Status = StatusTy::Unsafe;
1840 }
1841
1842 /// Freeze the ToFreeze and update the use in \p UserI to use it.
1843 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1844 assert(isSafeWithFreeze() &&
1845 "should only be used when freezing is required");
1846 assert(is_contained(ToFreeze->users(), &UserI) &&
1847 "UserI must be a user of ToFreeze");
1848 IRBuilder<>::InsertPointGuard Guard(Builder);
1849 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1850 Value *Frozen =
1851 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1852 for (Use &U : make_early_inc_range((UserI.operands())))
1853 if (U.get() == ToFreeze)
1854 U.set(Frozen);
1855
1856 ToFreeze = nullptr;
1857 }
1858};
1859} // namespace
1860
1861/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1862/// Idx. \p Idx must access a valid vector element.
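/// For example (illustrative): with a <4 x i32> vector, a constant index 2 is
/// safe and a constant index 7 is unsafe; a variable index clamped by
/// "and i64 %i, 3" is safe only after freezing %i, because a poison %i would
/// not be constrained by the mask.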
1863static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1864 Instruction *CtxI,
1865 AssumptionCache &AC,
1866 const DominatorTree &DT) {
1867 // We do checks for both fixed vector types and scalable vector types.
1868 // This is the number of elements of fixed vector types,
1869 // or the minimum number of elements of scalable vector types.
1870 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1871 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1872
1873 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1874 if (C->getValue().ult(NumElements))
1875 return ScalarizationResult::safe();
1876 return ScalarizationResult::unsafe();
1877 }
1878
1879 // Always unsafe if the index type can't handle all inbound values.
1880 if (!llvm::isUIntN(IntWidth, NumElements))
1881 return ScalarizationResult::unsafe();
1882
1883 APInt Zero(IntWidth, 0);
1884 APInt MaxElts(IntWidth, NumElements);
1885 ConstantRange ValidIndices(Zero, MaxElts);
1886 ConstantRange IdxRange(IntWidth, true);
1887
1888 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1889 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1890 true, &AC, CtxI, &DT)))
1891 return ScalarizationResult::safe();
1892 return ScalarizationResult::unsafe();
1893 }
1894
1895 // If the index may be poison, check if we can insert a freeze before the
1896 // range of the index is restricted.
1897 Value *IdxBase;
1898 ConstantInt *CI;
1899 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1900 IdxRange = IdxRange.binaryAnd(CI->getValue());
1901 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1902 IdxRange = IdxRange.urem(CI->getValue());
1903 }
1904
1905 if (ValidIndices.contains(IdxRange))
1906 return ScalarizationResult::safeWithFreeze(IdxBase);
1907 return ScalarizationResult::unsafe();
1908}
1909
1910/// The memory operation on a vector of \p ScalarType had alignment of
1911/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1912/// alignment that will be valid for the memory operation on a single scalar
1913/// element of the same type with index \p Idx.
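/// Worked example (illustrative): for a <4 x i32> access with alignment 16,
/// element 1 gets commonAlignment(16, 1 * 4) = 4, element 2 gets
/// commonAlignment(16, 2 * 4) = 8, and an unknown index falls back to
/// commonAlignment(16, 4) = 4.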
1914static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1915 Type *ScalarType, Value *Idx,
1916 const DataLayout &DL) {
1917 if (auto *C = dyn_cast<ConstantInt>(Idx))
1918 return commonAlignment(VectorAlignment,
1919 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1920 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1921}
1922
1923// Combine patterns like:
1924// %0 = load <4 x i32>, <4 x i32>* %a
1925// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1926// store <4 x i32> %1, <4 x i32>* %a
1927// to:
1928// %0 = bitcast <4 x i32>* %a to i32*
1929// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1930// store i32 %b, i32* %1
1931bool VectorCombine::foldSingleElementStore(Instruction &I) {
1933 return false;
1934 auto *SI = cast<StoreInst>(&I);
1935 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1936 return false;
1937
1938 // TODO: Combine more complicated patterns (multiple insert) by referencing
1939 // TargetTransformInfo.
1940 Instruction *Source;
1941 Value *NewElement;
1942 Value *Idx;
1943 if (!match(SI->getValueOperand(),
1944 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1945 m_Value(Idx))))
1946 return false;
1947
1948 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1949 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1950 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1951 // Don't optimize for atomic/volatile load or store. Ensure memory is not
1952 // modified in between, vector type matches store size, and index is inbounds.
1953 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1954 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1955 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1956 return false;
1957
1958 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1959 if (ScalarizableIdx.isUnsafe() ||
1960 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1961 MemoryLocation::get(SI), AA))
1962 return false;
1963
1964 // Ensure we add the load back to the worklist BEFORE its users so they can
1965 // be erased in the correct order.
1966 Worklist.push(Load);
1967
1968 if (ScalarizableIdx.isSafeWithFreeze())
1969 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1970 Value *GEP = Builder.CreateInBoundsGEP(
1971 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1972 {ConstantInt::get(Idx->getType(), 0), Idx});
1973 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1974 NSI->copyMetadata(*SI);
1975 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1976 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1977 *DL);
1978 NSI->setAlignment(ScalarOpAlignment);
1979 replaceValue(I, *NSI);
1981 return true;
1982 }
1983
1984 return false;
1985}
1986
1987/// Try to scalarize vector loads feeding extractelement or bitcast
1988/// instructions.
1989bool VectorCombine::scalarizeLoad(Instruction &I) {
1990 Value *Ptr;
1991 if (!match(&I, m_Load(m_Value(Ptr))))
1992 return false;
1993
1994 auto *LI = cast<LoadInst>(&I);
1995 auto *VecTy = cast<VectorType>(LI->getType());
1996 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
1997 return false;
1998
1999 bool AllExtracts = true;
2000 bool AllBitcasts = true;
2001 Instruction *LastCheckedInst = LI;
2002 unsigned NumInstChecked = 0;
2003
2004 // Check what type of users we have (must either all be extracts or
2005 // bitcasts) and ensure no memory modifications between the load and
2006 // its users.
2007 for (User *U : LI->users()) {
2008 auto *UI = dyn_cast<Instruction>(U);
2009 if (!UI || UI->getParent() != LI->getParent())
2010 return false;
2011
2012 // If any user is waiting to be erased, then bail out as this will
2013 // distort the cost calculation and possibly lead to infinite loops.
2014 if (UI->use_empty())
2015 return false;
2016
2017 if (!isa<ExtractElementInst>(UI))
2018 AllExtracts = false;
2019 if (!isa<BitCastInst>(UI))
2020 AllBitcasts = false;
2021
2022 // Check if any instruction between the load and the user may modify memory.
2023 if (LastCheckedInst->comesBefore(UI)) {
2024 for (Instruction &I :
2025 make_range(std::next(LI->getIterator()), UI->getIterator())) {
2026 // Bail out if we reached the check limit or the instruction may write
2027 // to memory.
2028 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
2029 return false;
2030 NumInstChecked++;
2031 }
2032 LastCheckedInst = UI;
2033 }
2034 }
2035
2036 if (AllExtracts)
2037 return scalarizeLoadExtract(LI, VecTy, Ptr);
2038 if (AllBitcasts)
2039 return scalarizeLoadBitcast(LI, VecTy, Ptr);
2040 return false;
2041}
2042
2043/// Try to scalarize vector loads feeding extractelement instructions.
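/// A sketch of the rewrite (illustrative; <4 x i32> chosen arbitrarily):
///   %v = load <4 x i32>, ptr %p
///   %e = extractelement <4 x i32> %v, i64 %i
/// becomes
///   %gep = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 %i
///   %e   = load i32, ptr %gep
/// with %i frozen first when canScalarizeAccess requires it.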
2044bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
2045 Value *Ptr) {
2047 return false;
2048
2049 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
2050 llvm::scope_exit FailureGuard([&]() {
2051 // If the transform is aborted, discard the ScalarizationResults.
2052 for (auto &Pair : NeedFreeze)
2053 Pair.second.discard();
2054 });
2055
2056 InstructionCost OriginalCost =
2057 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2058 LI->getPointerAddressSpace(), CostKind);
2059 InstructionCost ScalarizedCost = 0;
2060
2061 for (User *U : LI->users()) {
2062 auto *UI = cast<ExtractElementInst>(U);
2063
2064 auto ScalarIdx =
2065 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
2066 if (ScalarIdx.isUnsafe())
2067 return false;
2068 if (ScalarIdx.isSafeWithFreeze()) {
2069 NeedFreeze.try_emplace(UI, ScalarIdx);
2070 ScalarIdx.discard();
2071 }
2072
2073 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
2074 OriginalCost +=
2075 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
2076 Index ? Index->getZExtValue() : -1);
2077 ScalarizedCost +=
2078 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
2079 Align(1), LI->getPointerAddressSpace(), CostKind);
2080 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
2081 nullptr, nullptr, CostKind);
2082 }
2083
2084 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
2085 << "\n LoadExtractCost: " << OriginalCost
2086 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2087
2088 if (ScalarizedCost >= OriginalCost)
2089 return false;
2090
2091 // Ensure we add the load back to the worklist BEFORE its users so they can
2092 // be erased in the correct order.
2093 Worklist.push(LI);
2094
2095 Type *ElemType = VecTy->getElementType();
2096
2097 // Replace extracts with narrow scalar loads.
2098 for (User *U : LI->users()) {
2099 auto *EI = cast<ExtractElementInst>(U);
2100 Value *Idx = EI->getIndexOperand();
2101
2102 // Insert 'freeze' for poison indexes.
2103 auto It = NeedFreeze.find(EI);
2104 if (It != NeedFreeze.end())
2105 It->second.freeze(Builder, *cast<Instruction>(Idx));
2106
2107 Builder.SetInsertPoint(EI);
2108 Value *GEP =
2109 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
2110 auto *NewLoad = cast<LoadInst>(
2111 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
2112
2113 Align ScalarOpAlignment =
2114 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
2115 NewLoad->setAlignment(ScalarOpAlignment);
2116
2117 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
2118 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
2119 AAMDNodes OldAAMD = LI->getAAMetadata();
2120 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
2121 }
2122
2123 replaceValue(*EI, *NewLoad, false);
2124 }
2125
2126 FailureGuard.release();
2127 return true;
2128}
2129
2130/// Try to scalarize vector loads feeding bitcast instructions.
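/// A sketch of the rewrite (illustrative):
///   %v = load <4 x i8>, ptr %p
///   %s = bitcast <4 x i8> %v to i32
/// becomes
///   %s = load i32, ptr %p
/// provided every user of the load is a bitcast to the same equally-sized
/// scalar type and the cost model prefers the scalar load.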
2131bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
2132 Value *Ptr) {
2133 InstructionCost OriginalCost =
2134 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2135 LI->getPointerAddressSpace(), CostKind);
2136
2137 Type *TargetScalarType = nullptr;
2138 unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
2139
2140 for (User *U : LI->users()) {
2141 auto *BC = cast<BitCastInst>(U);
2142
2143 Type *DestTy = BC->getDestTy();
2144 if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
2145 return false;
2146
2147 unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
2148 if (DestBitWidth != VecBitWidth)
2149 return false;
2150
2151 // All bitcasts must target the same scalar type.
2152 if (!TargetScalarType)
2153 TargetScalarType = DestTy;
2154 else if (TargetScalarType != DestTy)
2155 return false;
2156
2157 OriginalCost +=
2158 TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
2159 TTI::CastContextHint::None, CostKind);
2160 }
2161
2162 if (!TargetScalarType)
2163 return false;
2164
2165 assert(!LI->user_empty() && "Unexpected load without bitcast users");
2166 InstructionCost ScalarizedCost =
2167 TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
2168 LI->getPointerAddressSpace(), CostKind);
2169
2170 LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2171 << "\n OriginalCost: " << OriginalCost
2172 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2173
2174 if (ScalarizedCost >= OriginalCost)
2175 return false;
2176
2177 // Ensure we add the load back to the worklist BEFORE its users so they can
2178 // be erased in the correct order.
2179 Worklist.push(LI);
2180
2181 Builder.SetInsertPoint(LI);
2182 auto *ScalarLoad =
2183 Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
2184 ScalarLoad->setAlignment(LI->getAlign());
2185 ScalarLoad->copyMetadata(*LI);
2186
2187 // Replace all bitcast users with the scalar load.
2188 for (User *U : LI->users()) {
2189 auto *BC = cast<BitCastInst>(U);
2190 replaceValue(*BC, *ScalarLoad, false);
2191 }
2192
2193 return true;
2194}
2195
2196bool VectorCombine::scalarizeExtExtract(Instruction &I) {
2198 return false;
2199 auto *Ext = dyn_cast<ZExtInst>(&I);
2200 if (!Ext)
2201 return false;
2202
2203 // Try to convert a vector zext feeding only extracts into a set of scalar
2204 // (Src >> (ExtIdx * EltBits)) & LowBitMask
2205 // operations, if profitable.
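// For example (illustrative), a zext of <4 x i8> to <4 x i32> used only by
// extracts of lanes 0..3 can become, on a little-endian target:
//   %s = bitcast <4 x i8> %src to i32
//   lane i: and (lshr i32 %s, i * 8), 255
// (the shift amount is mirrored on big-endian targets).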
2206 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2207 if (!SrcTy)
2208 return false;
2209 auto *DstTy = cast<FixedVectorType>(Ext->getType());
2210
2211 Type *ScalarDstTy = DstTy->getElementType();
2212 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2213 return false;
2214
2215 InstructionCost VectorCost =
2216 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2217 TTI::CastContextHint::None, CostKind);
2218 unsigned ExtCnt = 0;
2219 bool ExtLane0 = false;
2220 for (User *U : Ext->users()) {
2221 uint64_t Idx;
2222 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2223 return false;
2224 if (cast<Instruction>(U)->use_empty())
2225 continue;
2226 ExtCnt += 1;
2227 ExtLane0 |= !Idx;
2228 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2229 CostKind, Idx, U);
2230 }
2231
2232 InstructionCost ScalarCost =
2233 ExtCnt * TTI.getArithmeticInstrCost(
2234 Instruction::And, ScalarDstTy, CostKind,
2237 (ExtCnt - ExtLane0) *
2239 Instruction::LShr, ScalarDstTy, CostKind,
2242 if (ScalarCost > VectorCost)
2243 return false;
2244
2245 Value *ScalarV = Ext->getOperand(0);
2246 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2247 &DT)) {
2248 // Check whether all lanes are extracted, all extracts trigger UB
2249 // on poison, and the last extract (and hence all previous ones)
2250 // are guaranteed to execute if Ext executes. If so, we do not
2251 // need to insert a freeze.
2252 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2253 bool AllExtractsTriggerUB = true;
2254 ExtractElementInst *LastExtract = nullptr;
2255 BasicBlock *ExtBB = Ext->getParent();
2256 for (User *U : Ext->users()) {
2257 auto *Extract = cast<ExtractElementInst>(U);
2258 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2259 AllExtractsTriggerUB = false;
2260 break;
2261 }
2262 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2263 if (!LastExtract || LastExtract->comesBefore(Extract))
2264 LastExtract = Extract;
2265 }
2266 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2267 !AllExtractsTriggerUB ||
2268 !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(),
2269 LastExtract->getIterator()))
2270 ScalarV = Builder.CreateFreeze(ScalarV);
2271 }
2272 ScalarV = Builder.CreateBitCast(
2273 ScalarV,
2274 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2275 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2276 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2277 APInt EltBitMask = APInt::getLowBitsSet(TotalBits, SrcEltSizeInBits);
2278 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2279 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2280 for (User *U : Ext->users()) {
2281 auto *Extract = cast<ExtractElementInst>(U);
2282 uint64_t Idx =
2283 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2284 uint64_t ShiftAmt =
2285 DL->isBigEndian()
2286 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2287 : (Idx * SrcEltSizeInBits);
2288 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2289 Value *And = Builder.CreateAnd(LShr, Mask);
2290 U->replaceAllUsesWith(And);
2291 }
2292 return true;
2293}
2294
2295/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2296/// to "(bitcast (concat X, Y))"
2297/// where X/Y are bitcasted from i1 mask vectors.
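/// A sketch with <8 x i1> masks and an i16 result (illustrative):
///   %xb = bitcast <8 x i1> %x to i8
///   %yb = bitcast <8 x i1> %y to i8
///   %xz = zext i8 %xb to i16
///   %yz = zext i8 %yb to i16
///   %ys = shl i16 %yz, 8
///   %r  = or disjoint i16 %xz, %ys
/// becomes a 16-element shuffle concatenating %x and %y followed by a single
/// bitcast of the <16 x i1> result to i16.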
2298bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2299 Type *Ty = I.getType();
2300 if (!Ty->isIntegerTy())
2301 return false;
2302
2303 // TODO: Add big endian test coverage
2304 if (DL->isBigEndian())
2305 return false;
2306
2307 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2308 Instruction *X, *Y;
2309 if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y))))
2310 return false;
2311
2312 // Allow both sources to contain shl, to handle more generic pattern:
2313 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2314 Value *SrcX;
2315 uint64_t ShAmtX = 0;
2316 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2317 !match(X, m_OneUse(
2318 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
2319 m_ConstantInt(ShAmtX)))))
2320 return false;
2321
2322 Value *SrcY;
2323 uint64_t ShAmtY = 0;
2324 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2325 !match(Y, m_OneUse(
2326 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
2327 m_ConstantInt(ShAmtY)))))
2328 return false;
2329
2330 // Canonicalize larger shift to the RHS.
2331 if (ShAmtX > ShAmtY) {
2332 std::swap(X, Y);
2333 std::swap(SrcX, SrcY);
2334 std::swap(ShAmtX, ShAmtY);
2335 }
2336
2337 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2338 // difference is the mask width so they can be easily concatenated together.
2339 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2340 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2341 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2342 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2343 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2344 !MaskTy->getElementType()->isIntegerTy(1) ||
2345 MaskTy->getNumElements() != ShAmtDiff ||
2346 MaskTy->getNumElements() > (BitWidth / 2))
2347 return false;
2348
2349 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2350 auto *ConcatIntTy =
2351 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2352 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2353
2354 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2355 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2356
2357 // TODO: Is it worth supporting multi use cases?
2358 InstructionCost OldCost = 0;
2359 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2360 OldCost +=
2361 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2362 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2363 TTI::CastContextHint::None, CostKind);
2364 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2365 TTI::CastContextHint::None, CostKind);
2366
2367 InstructionCost NewCost = 0;
2368 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ConcatTy,
2369 MaskTy, ConcatMask, CostKind);
2370 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2371 TTI::CastContextHint::None, CostKind);
2372 if (Ty != ConcatIntTy)
2373 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2374 TTI::CastContextHint::None, CostKind);
2375 if (ShAmtX > 0)
2376 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2377
2378 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2379 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2380 << "\n");
2381
2382 if (NewCost > OldCost)
2383 return false;
2384
2385 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2386 // any residual zero-extension or shifting.
2387 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2388 Worklist.pushValue(Concat);
2389
2390 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2391
2392 if (Ty != ConcatIntTy) {
2393 Worklist.pushValue(Result);
2394 Result = Builder.CreateZExt(Result, Ty);
2395 }
2396
2397 if (ShAmtX > 0) {
2398 Worklist.pushValue(Result);
2399 Result = Builder.CreateShl(Result, ShAmtX);
2400 }
2401
2402 replaceValue(I, *Result);
2403 return true;
2404}
2405
2406/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2407/// --> "binop (shuffle), (shuffle)".
2408bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2409 BinaryOperator *BinOp;
2410 ArrayRef<int> OuterMask;
2411 if (!match(&I, m_Shuffle(m_BinOp(BinOp), m_Undef(), m_Mask(OuterMask))))
2412 return false;
2413
2414 // Don't introduce poison into div/rem.
2415 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2416 return false;
2417
2418 Value *Op00, *Op01, *Op10, *Op11;
2419 ArrayRef<int> Mask0, Mask1;
2420 bool Match0 = match(BinOp->getOperand(0),
2421 m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)));
2422 bool Match1 = match(BinOp->getOperand(1),
2423 m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)));
2424 if (!Match0 && !Match1)
2425 return false;
2426
2427 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2428 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2429 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2430 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2431
2432 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2433 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2434 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2435 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2436 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2437 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2438 return false;
2439
2440 unsigned NumSrcElts = BinOpTy->getNumElements();
2441
2442 // Don't accept shuffles that reference the second operand in
2444 // div/rem or if it's an undef arg.
2444 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2445 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2446 return false;
2447
2448 // Merge outer / inner (or identity if no match) shuffles.
2449 SmallVector<int> NewMask0, NewMask1;
2450 for (int M : OuterMask) {
2451 if (M < 0 || M >= (int)NumSrcElts) {
2452 NewMask0.push_back(PoisonMaskElem);
2453 NewMask1.push_back(PoisonMaskElem);
2454 } else {
2455 NewMask0.push_back(Match0 ? Mask0[M] : M);
2456 NewMask1.push_back(Match1 ? Mask1[M] : M);
2457 }
2458 }
2459
2460 unsigned NumOpElts = Op0Ty->getNumElements();
2461 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2462 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2463 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2464 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2465 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2466 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2467
2468 InstructionCost NewCost = 0;
2469 // Try to merge shuffles across the binop if the new shuffles are not costly.
2470 InstructionCost BinOpCost =
2471 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind);
2472 InstructionCost OldCost =
2473 BinOpCost + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
2474 ShuffleDstTy, BinOpTy, OuterMask, CostKind,
2475 0, nullptr, {BinOp}, &I);
2476 if (!BinOp->hasOneUse())
2477 NewCost += BinOpCost;
2478
2479 if (Match0) {
2480 InstructionCost Shuf0Cost = TTI.getShuffleCost(
2481 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2482 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2483 OldCost += Shuf0Cost;
2484 if (!BinOp->hasOneUse() || !BinOp->getOperand(0)->hasOneUse())
2485 NewCost += Shuf0Cost;
2486 }
2487 if (Match1) {
2488 InstructionCost Shuf1Cost = TTI.getShuffleCost(
2489 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2490 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2491 OldCost += Shuf1Cost;
2492 if (!BinOp->hasOneUse() || !BinOp->getOperand(1)->hasOneUse())
2493 NewCost += Shuf1Cost;
2494 }
2495
2496 NewCost += TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2497
2498 if (!IsIdentity0)
2499 NewCost +=
2500 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2501 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2502 if (!IsIdentity1)
2503 NewCost +=
2504 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2505 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2506
2507 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2508 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2509 << "\n");
2510
2511 // If costs are equal, still fold as we reduce instruction count.
2512 if (NewCost > OldCost)
2513 return false;
2514
2515 Value *LHS =
2516 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2517 Value *RHS =
2518 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2519 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2520
2521 // Intersect flags from the old binops.
2522 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2523 NewInst->copyIRFlags(BinOp);
2524
2525 Worklist.pushValue(LHS);
2526 Worklist.pushValue(RHS);
2527 replaceValue(I, *NewBO);
2528 return true;
2529}
2530
2531/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2532/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
2533bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2534 ArrayRef<int> OldMask;
2535 Instruction *LHS, *RHS;
2536 if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)),
2537 m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
2538 return false;
2539
2540 // TODO: Add support for addlike etc.
2541 if (LHS->getOpcode() != RHS->getOpcode())
2542 return false;
2543
2544 Value *X, *Y, *Z, *W;
2545 bool IsCommutative = false;
2546 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2547 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2548 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2549 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2550 auto *BO = cast<BinaryOperator>(LHS);
2551 // Don't introduce poison into div/rem.
2552 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2553 return false;
2554 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2555 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2556 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2557 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2558 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2559 } else
2560 return false;
2561
2562 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2563 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2564 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2565 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2566 return false;
2567
2568 unsigned NumSrcElts = BinOpTy->getNumElements();
2569
2570 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2571 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2572 std::swap(X, Y);
2573
2574 auto ConvertToUnary = [NumSrcElts](int &M) {
2575 if (M >= (int)NumSrcElts)
2576 M -= NumSrcElts;
2577 };
2578
2579 SmallVector<int> NewMask0(OldMask);
2580 TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
2581 if (X == Z) {
2582 llvm::for_each(NewMask0, ConvertToUnary);
2583 SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
2584 Z = PoisonValue::get(BinOpTy);
2585 }
2586
2587 SmallVector<int> NewMask1(OldMask);
2588 TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
2589 if (Y == W) {
2590 llvm::for_each(NewMask1, ConvertToUnary);
2591 SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
2592 W = PoisonValue::get(BinOpTy);
2593 }
2594
2595 // Try to replace a binop with a shuffle if the shuffle is not costly.
2596 InstructionCost OldCost =
2597 TTI.getInstructionCost(LHS, CostKind) +
2598 TTI.getInstructionCost(RHS, CostKind) +
2599 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2600 BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
2601 &I);
2602
2603 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2604 // where one use shuffles have gotten split across the binop/cmp. These
2605 // often allow a major reduction in total cost that wouldn't happen as
2606 // individual folds.
2607 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2608 TTI::TargetCostKind CostKind) -> bool {
2609 Value *InnerOp;
2610 ArrayRef<int> InnerMask;
2611 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2612 m_Mask(InnerMask)))) &&
2613 InnerOp->getType() == Op->getType() &&
2614 all_of(InnerMask,
2615 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2616 for (int &M : Mask)
2617 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2618 M = InnerMask[M - Offset];
2619 M = 0 <= M ? M + Offset : M;
2620 }
2622 Op = InnerOp;
2623 return true;
2624 }
2625 return false;
2626 };
2627 bool ReducedInstCount = false;
2628 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2629 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2630 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2631 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2632 bool SingleSrcBinOp = (X == Y) && (Z == W) && (NewMask0 == NewMask1);
2633 ReducedInstCount |= SingleSrcBinOp;
2634
2635 auto *ShuffleCmpTy =
2636 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2637 InstructionCost NewCost = TTI.getShuffleCost(
2638 SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z});
2639 if (!SingleSrcBinOp)
2640 NewCost += TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1,
2641 CostKind, 0, nullptr, {Y, W});
2642
2643 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2644 NewCost +=
2645 TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
2646 } else {
2647 NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
2648 ShuffleDstTy, PredLHS, CostKind);
2649 }
2650
2651 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2652 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2653 << "\n");
2654
2655 // If either shuffle will constant fold away, then fold for the same cost as
2656 // we will reduce the instruction count.
2657 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2658 (isa<Constant>(Y) && isa<Constant>(W));
2659 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2660 return false;
2661
2662 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2663 Value *Shuf1 =
2664 SingleSrcBinOp ? Shuf0 : Builder.CreateShuffleVector(Y, W, NewMask1);
2665 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2666 ? Builder.CreateBinOp(
2667 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2668 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2669
2670 // Intersect flags from the old binops.
2671 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2672 NewInst->copyIRFlags(LHS);
2673 NewInst->andIRFlags(RHS);
2674 }
2675
2676 Worklist.pushValue(Shuf0);
2677 Worklist.pushValue(Shuf1);
2678 replaceValue(I, *NewBO);
2679 return true;
2680}
2681
2682/// Try to convert
2683/// "shuffle (select c1, t1, f1), (select c2, t2, f2), m" into
2684/// "select (shuffle c1, c2, m), (shuffle t1, t2, m), (shuffle f1, f2, m)".
2685bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2686 ArrayRef<int> Mask;
2687 Value *C1, *T1, *F1, *C2, *T2, *F2;
2688 if (!match(&I, m_Shuffle(m_Select(m_Value(C1), m_Value(T1), m_Value(F1)),
2689 m_Select(m_Value(C2), m_Value(T2), m_Value(F2)),
2690 m_Mask(Mask))))
2691 return false;
2692
2693 auto *Sel1 = cast<Instruction>(I.getOperand(0));
2694 auto *Sel2 = cast<Instruction>(I.getOperand(1));
2695
2696 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2697 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2698 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2699 return false;
2700
2701 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2702 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2703 // SelectInsts must have the same FMF.
2704 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2705 ((SI0FOp != nullptr) &&
2706 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2707 return false;
2708
2709 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2710 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2711 auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
2712 auto SelOp = Instruction::Select;
2713
2714 InstructionCost CostSel1 = TTI.getCmpSelInstrCost(
2715 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2716 InstructionCost CostSel2 = TTI.getCmpSelInstrCost(
2717 SelOp, SrcVecTy, C2VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2718
2719 InstructionCost OldCost =
2720 CostSel1 + CostSel2 +
2721 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2722 {I.getOperand(0), I.getOperand(1)}, &I);
2723
2724 InstructionCost NewCost = TTI.getShuffleCost(
2725 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2726 Mask, CostKind, 0, nullptr, {C1, C2});
2727 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2728 nullptr, {T1, T2});
2729 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2730 nullptr, {F1, F2});
2731 auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2732 toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2733 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2734 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2735
2736 if (!Sel1->hasOneUse())
2737 NewCost += CostSel1;
2738 if (!Sel2->hasOneUse())
2739 NewCost += CostSel2;
2740
2741 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2742 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2743 << "\n");
2744 if (NewCost > OldCost)
2745 return false;
2746
2747 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2748 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2749 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2750 Value *NewSel;
2751 // We presuppose that the SelectInsts have the same FMF.
2752 if (SI0FOp)
2753 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2754 SI0FOp->getFastMathFlags());
2755 else
2756 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2757
2758 Worklist.pushValue(ShuffleCmp);
2759 Worklist.pushValue(ShuffleTrue);
2760 Worklist.pushValue(ShuffleFalse);
2761 replaceValue(I, *NewSel);
2762 return true;
2763}
2764
2765/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2766/// into "castop (shuffle)".
2767bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2768 Value *V0, *V1;
2769 ArrayRef<int> OldMask;
2770 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2771 return false;
2772
2773 // Check whether this is a binary shuffle.
2774 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2775
2776 auto *C0 = dyn_cast<CastInst>(V0);
2777 auto *C1 = dyn_cast<CastInst>(V1);
2778 if (!C0 || (IsBinaryShuffle && !C1))
2779 return false;
2780
2781 Instruction::CastOps Opcode = C0->getOpcode();
2782
2783 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2784 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2785 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2786 return false;
2787
2788 if (IsBinaryShuffle) {
2789 if (C0->getSrcTy() != C1->getSrcTy())
2790 return false;
2791 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2792 if (Opcode != C1->getOpcode()) {
2793 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2794 Opcode = Instruction::SExt;
2795 else
2796 return false;
2797 }
2798 }
2799
2800 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2801 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2802 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2803 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2804 return false;
2805
2806 unsigned NumSrcElts = CastSrcTy->getNumElements();
2807 unsigned NumDstElts = CastDstTy->getNumElements();
2808 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2809 "Only bitcasts expected to alter src/dst element counts");
2810
2811 // Check for bitcasting of unscalable vector types.
2812 // e.g. <32 x i40> -> <40 x i32>
2813 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2814 (NumDstElts % NumSrcElts) != 0)
2815 return false;
2816
2817 SmallVector<int, 16> NewMask;
2818 if (NumSrcElts >= NumDstElts) {
2819 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2820 // always be expanded to the equivalent form choosing narrower elements.
2821 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2822 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2823 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2824 } else {
2825 // The bitcast is from narrow elements to wide elements. The shuffle mask
2826 // must choose consecutive elements to allow casting first.
2827 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2828 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2829 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2830 return false;
2831 }
2832
2833 auto *NewShuffleDstTy =
2834 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2835
2836 // Try to replace a castop with a shuffle if the shuffle is not costly.
2837 InstructionCost CostC0 =
2838 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2839 TTI::CastContextHint::None, CostKind);
2840
2841 TargetTransformInfo::ShuffleKind ShuffleKind;
2842 if (IsBinaryShuffle)
2843 ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc;
2844 else
2845 ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc;
2846
2847 InstructionCost OldCost = CostC0;
2848 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2849 CostKind, 0, nullptr, {}, &I);
2850
2851 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2852 CastSrcTy, NewMask, CostKind);
2853 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
2854 TTI::CastContextHint::None, CostKind);
2855 if (!C0->hasOneUse())
2856 NewCost += CostC0;
2857 if (IsBinaryShuffle) {
2858 InstructionCost CostC1 =
2859 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2860 TTI::CastContextHint::None, CostKind);
2861 OldCost += CostC1;
2862 if (!C1->hasOneUse())
2863 NewCost += CostC1;
2864 }
2865
2866 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2867 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2868 << "\n");
2869 if (NewCost > OldCost)
2870 return false;
2871
2872 Value *Shuf;
2873 if (IsBinaryShuffle)
2874 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2875 NewMask);
2876 else
2877 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2878
2879 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2880
2881 // Intersect flags from the old casts.
2882 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2883 NewInst->copyIRFlags(C0);
2884 if (IsBinaryShuffle)
2885 NewInst->andIRFlags(C1);
2886 }
2887
2888 Worklist.pushValue(Shuf);
2889 replaceValue(I, *Cast);
2890 return true;
2891}
2892
2893/// Try to convert any of:
2894/// "shuffle (shuffle x, y), (shuffle y, x)"
2895/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2896/// "shuffle (shuffle x, undef), y"
2897/// "shuffle x, (shuffle y, undef)"
2898/// into "shuffle x, y".
2899bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2900 ArrayRef<int> OuterMask;
2901 Value *OuterV0, *OuterV1;
2902 if (!match(&I,
2903 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2904 return false;
2905
2906 ArrayRef<int> InnerMask0, InnerMask1;
2907 Value *X0, *X1, *Y0, *Y1;
2908 bool Match0 =
2909 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2910 bool Match1 =
2911 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2912 if (!Match0 && !Match1)
2913 return false;
2914
2915 // If the outer shuffle is a permute, then create a fake inner all-poison
2916 // shuffle. This is easier than accounting for length-changing shuffles below.
2917 SmallVector<int, 16> PoisonMask1;
2918 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2919 X1 = X0;
2920 Y1 = Y0;
2921 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2922 InnerMask1 = PoisonMask1;
2923 Match1 = true; // fake match
2924 }
2925
2926 X0 = Match0 ? X0 : OuterV0;
2927 Y0 = Match0 ? Y0 : OuterV0;
2928 X1 = Match1 ? X1 : OuterV1;
2929 Y1 = Match1 ? Y1 : OuterV1;
2930 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2931 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2932 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2933 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2934 X0->getType() != X1->getType())
2935 return false;
2936
2937 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2938 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2939
2940 // Attempt to merge shuffles, matching up to 2 source operands.
2941 // Replace index to a poison arg with PoisonMaskElem.
2942 // Bail if either inner masks reference an undef arg.
2943 SmallVector<int, 16> NewMask(OuterMask);
2944 Value *NewX = nullptr, *NewY = nullptr;
2945 for (int &M : NewMask) {
2946 Value *Src = nullptr;
2947 if (0 <= M && M < (int)NumImmElts) {
2948 Src = OuterV0;
2949 if (Match0) {
2950 M = InnerMask0[M];
2951 Src = M >= (int)NumSrcElts ? Y0 : X0;
2952 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2953 }
2954 } else if (M >= (int)NumImmElts) {
2955 Src = OuterV1;
2956 M -= NumImmElts;
2957 if (Match1) {
2958 M = InnerMask1[M];
2959 Src = M >= (int)NumSrcElts ? Y1 : X1;
2960 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2961 }
2962 }
2963 if (Src && M != PoisonMaskElem) {
2964 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2965 if (isa<UndefValue>(Src)) {
2966 // We've referenced an undef element - if it's poison, update the shuffle
2967 // mask, else bail.
2968 if (!isa<PoisonValue>(Src))
2969 return false;
2970 M = PoisonMaskElem;
2971 continue;
2972 }
2973 if (!NewX || NewX == Src) {
2974 NewX = Src;
2975 continue;
2976 }
2977 if (!NewY || NewY == Src) {
2978 M += NumSrcElts;
2979 NewY = Src;
2980 continue;
2981 }
2982 return false;
2983 }
2984 }
2985
2986 if (!NewX)
2987 return PoisonValue::get(ShuffleDstTy);
2988 if (!NewY)
2989 NewY = PoisonValue::get(ShuffleSrcTy);
2990
2991 // Have we folded to an Identity shuffle?
2992 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
2993 replaceValue(I, *NewX);
2994 return true;
2995 }
2996
2997 // Try to merge the shuffles if the new shuffle is not costly.
2998 InstructionCost InnerCost0 = 0;
2999 if (Match0)
3000 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
3001
3002 InstructionCost InnerCost1 = 0;
3003 if (Match1)
3004 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
3005
3006 InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind);
3007
3008 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
3009
3010 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
3011 TargetTransformInfo::ShuffleKind SK =
3012 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
3013 : TargetTransformInfo::SK_PermuteTwoSrc;
3014 InstructionCost NewCost =
3015 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
3016 nullptr, {NewX, NewY});
3017 if (!OuterV0->hasOneUse())
3018 NewCost += InnerCost0;
3019 if (!OuterV1->hasOneUse())
3020 NewCost += InnerCost1;
3021
3022 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
3023 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3024 << "\n");
3025 if (NewCost > OldCost)
3026 return false;
3027
3028 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
3029 replaceValue(I, *Shuf);
3030 return true;
3031}
3032
3033/// Try to convert a chain of length-preserving shuffles that are fed by
3034/// length-changing shuffles from the same source, e.g. a chain of length 3:
3035///
3036/// "shuffle (shuffle (shuffle x, (shuffle y, undef)),
3037/// (shuffle y, undef)),
3038/// (shuffle y, undef)"
3039///
3040/// into a single shuffle fed by a length-changing shuffle:
3041///
3042/// "shuffle x, (shuffle y, undef)"
3043///
3044/// Such chains arise e.g. from folding extract/insert sequences.
3045bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
3046 FixedVectorType *TrunkType = dyn_cast<FixedVectorType>(I.getType());
3047 if (!TrunkType)
3048 return false;
3049
3050 unsigned ChainLength = 0;
3051 SmallVector<int> Mask;
3052 SmallVector<int> YMask;
3053 InstructionCost OldCost = 0;
3054 InstructionCost NewCost = 0;
3055 Value *Trunk = &I;
3056 unsigned NumTrunkElts = TrunkType->getNumElements();
3057 Value *Y = nullptr;
3058
3059 for (;;) {
3060 // Match the current trunk against (commutations of) the pattern
3061 // "shuffle trunk', (shuffle y, undef)"
3062 ArrayRef<int> OuterMask;
3063 Value *OuterV0, *OuterV1;
3064 if (ChainLength != 0 && !Trunk->hasOneUse())
3065 break;
3066 if (!match(Trunk, m_Shuffle(m_Value(OuterV0), m_Value(OuterV1),
3067 m_Mask(OuterMask))))
3068 break;
3069 if (OuterV0->getType() != TrunkType) {
3070 // This shuffle is not length-preserving, so it cannot be part of the
3071 // chain.
3072 break;
3073 }
3074
3075 ArrayRef<int> InnerMask0, InnerMask1;
3076 Value *A0, *A1, *B0, *B1;
3077 bool Match0 =
3078 match(OuterV0, m_Shuffle(m_Value(A0), m_Value(B0), m_Mask(InnerMask0)));
3079 bool Match1 =
3080 match(OuterV1, m_Shuffle(m_Value(A1), m_Value(B1), m_Mask(InnerMask1)));
3081 bool Match0Leaf = Match0 && A0->getType() != I.getType();
3082 bool Match1Leaf = Match1 && A1->getType() != I.getType();
3083 if (Match0Leaf == Match1Leaf) {
3084 // Only handle the case of exactly one leaf in each step. The "two leaves"
3085 // case is handled by foldShuffleOfShuffles.
3086 break;
3087 }
3088
3089 SmallVector<int> CommutedOuterMask;
3090 if (Match0Leaf) {
3091 std::swap(OuterV0, OuterV1);
3092 std::swap(InnerMask0, InnerMask1);
3093 std::swap(A0, A1);
3094 std::swap(B0, B1);
3095 llvm::append_range(CommutedOuterMask, OuterMask);
3096 for (int &M : CommutedOuterMask) {
3097 if (M == PoisonMaskElem)
3098 continue;
3099 if (M < (int)NumTrunkElts)
3100 M += NumTrunkElts;
3101 else
3102 M -= NumTrunkElts;
3103 }
3104 OuterMask = CommutedOuterMask;
3105 }
3106 if (!OuterV1->hasOneUse())
3107 break;
3108
3109 if (!isa<UndefValue>(A1)) {
3110 if (!Y)
3111 Y = A1;
3112 else if (Y != A1)
3113 break;
3114 }
3115 if (!isa<UndefValue>(B1)) {
3116 if (!Y)
3117 Y = B1;
3118 else if (Y != B1)
3119 break;
3120 }
3121
3122 auto *YType = cast<FixedVectorType>(A1->getType());
3123 int NumLeafElts = YType->getNumElements();
3124 SmallVector<int> LocalYMask(InnerMask1);
3125 for (int &M : LocalYMask) {
3126 if (M >= NumLeafElts)
3127 M -= NumLeafElts;
3128 }
3129
3130 InstructionCost LocalOldCost =
3131 TTI.getInstructionCost(cast<User>(OuterV1), CostKind) +
3132 TTI.getInstructionCost(cast<User>(Trunk), CostKind);
3133
3134 // Handle the initial (start of chain) case.
3135 if (!ChainLength) {
3136 Mask.assign(OuterMask);
3137 YMask.assign(LocalYMask);
3138 OldCost = NewCost = LocalOldCost;
3139 Trunk = OuterV0;
3140 ChainLength++;
3141 continue;
3142 }
3143
3144 // For the non-root case, first attempt to combine masks.
3145 SmallVector<int> NewYMask(YMask);
3146 bool Valid = true;
3147 for (auto [CombinedM, LeafM] : llvm::zip(NewYMask, LocalYMask)) {
3148 if (LeafM == -1 || CombinedM == LeafM)
3149 continue;
3150 if (CombinedM == -1) {
3151 CombinedM = LeafM;
3152 } else {
3153 Valid = false;
3154 break;
3155 }
3156 }
3157 if (!Valid)
3158 break;
3159
3160 SmallVector<int> NewMask;
3161 NewMask.reserve(NumTrunkElts);
3162 for (int M : Mask) {
3163 if (M < 0 || M >= static_cast<int>(NumTrunkElts))
3164 NewMask.push_back(M);
3165 else
3166 NewMask.push_back(OuterMask[M]);
3167 }
3168
3169 // Break the chain if adding this new step complicates the shuffles such
3170 // that it would increase the new cost by more than the old cost of this
3171 // step.
3172 InstructionCost LocalNewCost =
3173 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, TrunkType,
3174 YType, NewYMask, CostKind) +
3175 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, TrunkType,
3176 TrunkType, NewMask, CostKind);
3177
3178 if (LocalNewCost >= NewCost && LocalOldCost < LocalNewCost - NewCost)
3179 break;
3180
3181 LLVM_DEBUG({
3182 if (ChainLength == 1) {
3183 dbgs() << "Found chain of shuffles fed by length-changing shuffles: "
3184 << I << '\n';
3185 }
3186 dbgs() << " next chain link: " << *Trunk << '\n'
3187 << " old cost: " << (OldCost + LocalOldCost)
3188 << " new cost: " << LocalNewCost << '\n';
3189 });
3190
3191 Mask = NewMask;
3192 YMask = NewYMask;
3193 OldCost += LocalOldCost;
3194 NewCost = LocalNewCost;
3195 Trunk = OuterV0;
3196 ChainLength++;
3197 }
3198 if (ChainLength <= 1)
3199 return false;
3200
3201 if (llvm::all_of(Mask, [&](int M) {
3202 return M < 0 || M >= static_cast<int>(NumTrunkElts);
3203 })) {
3204 // Produce a canonical simplified form if all elements are sourced from Y.
3205 for (int &M : Mask) {
3206 if (M >= static_cast<int>(NumTrunkElts))
3207 M = YMask[M - NumTrunkElts];
3208 }
3209 Value *Root =
3210 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), Mask);
3211 replaceValue(I, *Root);
3212 return true;
3213 }
3214
3215 Value *Leaf =
3216 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), YMask);
3217 Value *Root = Builder.CreateShuffleVector(Trunk, Leaf, Mask);
3218 replaceValue(I, *Root);
3219 return true;
3220}
3221
3222/// Try to convert
3223/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
3224bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
3225 Value *V0, *V1;
3226 ArrayRef<int> OldMask;
3227 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
3228 return false;
3229
3230 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3231 auto *II1 = dyn_cast<IntrinsicInst>(V1);
3232 if (!II0 || !II1)
3233 return false;
3234
3235 Intrinsic::ID IID = II0->getIntrinsicID();
3236 if (IID != II1->getIntrinsicID())
3237 return false;
3238 InstructionCost CostII0 =
3239 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3240 InstructionCost CostII1 =
3241 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind);
3242
3243 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3244 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
3245 if (!ShuffleDstTy || !II0Ty)
3246 return false;
3247
3248 if (!isTriviallyVectorizable(IID))
3249 return false;
3250
3251 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3252 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) &&
3253 II0->getArgOperand(I) != II1->getArgOperand(I))
3254 return false;
3255
3256 InstructionCost OldCost =
3257 CostII0 + CostII1 +
3258 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
3259 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
3260
3261 SmallVector<Type *> NewArgsTy;
3262 InstructionCost NewCost = 0;
3263 SmallDenseSet<std::pair<Value *, Value *>> SeenOperandPairs;
3264 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3265 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3266 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3267 } else {
3268 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3269 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3270 ShuffleDstTy->getNumElements());
3271 NewArgsTy.push_back(ArgTy);
3272 std::pair<Value *, Value *> OperandPair =
3273 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3274 if (!SeenOperandPairs.insert(OperandPair).second) {
3275 // We've already computed the cost for this operand pair.
3276 continue;
3277 }
3278 NewCost += TTI.getShuffleCost(
3279 TargetTransformInfo::SK_PermuteTwoSrc, ArgTy, VecTy, OldMask,
3280 CostKind, 0, nullptr, {II0->getArgOperand(I), II1->getArgOperand(I)});
3281 }
3282 }
3283 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3284
3285 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3286 if (!II0->hasOneUse())
3287 NewCost += CostII0;
3288 if (II1 != II0 && !II1->hasOneUse())
3289 NewCost += CostII1;
3290
3291 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
3292 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3293 << "\n");
3294
3295 if (NewCost > OldCost)
3296 return false;
3297
3298 SmallVector<Value *> NewArgs;
3299 SmallDenseMap<std::pair<Value *, Value *>, Value *> ShuffleCache;
3300 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3301 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3302 NewArgs.push_back(II0->getArgOperand(I));
3303 } else {
3304 std::pair<Value *, Value *> OperandPair =
3305 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3306 auto It = ShuffleCache.find(OperandPair);
3307 if (It != ShuffleCache.end()) {
3308 // Reuse previously created shuffle for this operand pair.
3309 NewArgs.push_back(It->second);
3310 continue;
3311 }
3312 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
3313 II1->getArgOperand(I), OldMask);
3314 ShuffleCache[OperandPair] = Shuf;
3315 NewArgs.push_back(Shuf);
3316 Worklist.pushValue(Shuf);
3317 }
3318 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3319
3320 // Intersect flags from the old intrinsics.
3321 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
3322 NewInst->copyIRFlags(II0);
3323 NewInst->andIRFlags(II1);
3324 }
3325
3326 replaceValue(I, *NewIntrinsic);
3327 return true;
3328}
3329
3330/// Try to convert
3331/// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
3332bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
3333 Value *V0;
3334 ArrayRef<int> Mask;
3335 if (!match(&I, m_Shuffle(m_Value(V0), m_Undef(), m_Mask(Mask))))
3336 return false;
3337
3338 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3339 if (!II0)
3340 return false;
3341
3342 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3343 auto *IntrinsicSrcTy = dyn_cast<FixedVectorType>(II0->getType());
3344 if (!ShuffleDstTy || !IntrinsicSrcTy)
3345 return false;
3346
3347 // Validate it's a pure permute - the mask should only reference the first vector.
3348 unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
3349 if (any_of(Mask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
3350 return false;
3351
3352 Intrinsic::ID IID = II0->getIntrinsicID();
3353 if (!isTriviallyVectorizable(IID))
3354 return false;
3355
3356 // Cost analysis
3357 InstructionCost IntrinsicCost =
3358 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3359 InstructionCost OldCost =
3360 IntrinsicCost +
3361 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
3362 IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
3363
3364 SmallVector<Type *> NewArgsTy;
3365 InstructionCost NewCost = 0;
3366 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3367 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3368 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3369 } else {
3370 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3371 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3372 ShuffleDstTy->getNumElements());
3373 NewArgsTy.push_back(ArgTy);
3374 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3375 ArgTy, VecTy, Mask, CostKind, 0, nullptr,
3376 {II0->getArgOperand(I)});
3377 }
3378 }
3379 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3380 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3381
3382 // If the intrinsic has multiple uses, we need to account for the cost of
3383 // keeping the original intrinsic around.
3384 if (!II0->hasOneUse())
3385 NewCost += IntrinsicCost;
3386
3387 LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n OldCost: "
3388 << OldCost << " vs NewCost: " << NewCost << "\n");
3389
3390 if (NewCost > OldCost)
3391 return false;
3392
3393 // Transform
3394 SmallVector<Value *> NewArgs;
3395 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3396 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3397 NewArgs.push_back(II0->getArgOperand(I));
3398 } else {
3399 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), Mask);
3400 NewArgs.push_back(Shuf);
3401 Worklist.pushValue(Shuf);
3402 }
3403 }
3404
3405 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3406
3407 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic))
3408 NewInst->copyIRFlags(II0);
3409
3410 replaceValue(I, *NewIntrinsic);
3411 return true;
3412}
3413
3414using InstLane = std::pair<Use *, int>;
3415
3416static InstLane lookThroughShuffles(Use *U, int Lane) {
3417 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
3418 unsigned NumElts =
3419 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
3420 int M = SV->getMaskValue(Lane);
3421 if (M < 0)
3422 return {nullptr, PoisonMaskElem};
3423 if (static_cast<unsigned>(M) < NumElts) {
3424 U = &SV->getOperandUse(0);
3425 Lane = M;
3426 } else {
3427 U = &SV->getOperandUse(1);
3428 Lane = M - NumElts;
3429 }
3430 }
3431 return InstLane{U, Lane};
3432}
3433
3434 static SmallVector<InstLane>
3435 generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
3436 SmallVector<InstLane> NItem;
3437 for (InstLane IL : Item) {
3438 auto [U, Lane] = IL;
3439 InstLane OpLane =
3440 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
3441 Lane)
3442 : InstLane{nullptr, PoisonMaskElem};
3443 NItem.emplace_back(OpLane);
3444 }
3445 return NItem;
3446}
3447
3448/// Detect concat of multiple values into a vector
3449 static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
3450 const TargetTransformInfo &TTI) {
3451 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
3452 unsigned NumElts = Ty->getNumElements();
3453 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
3454 return false;
3455
3456 // Check that the concat is free, usually meaning that the type will be split
3457 // during legalization.
3458 SmallVector<int, 16> ConcatMask(NumElts * 2);
3459 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
3460 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
3461 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
3462 Ty, ConcatMask, CostKind) != 0)
3463 return false;
3464
3465 unsigned NumSlices = Item.size() / NumElts;
3466 // Currently we generate a tree of shuffles for the concats, which limits us
3467 // to a power2.
3468 if (!isPowerOf2_32(NumSlices))
3469 return false;
3470 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
3471 Use *SliceV = Item[Slice * NumElts].first;
3472 if (!SliceV || SliceV->get()->getType() != Ty)
3473 return false;
3474 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
3475 auto [V, Lane] = Item[Slice * NumElts + Elt];
3476 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
3477 return false;
3478 }
3479 }
3480 return true;
3481}
3482
3483 static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
3484 const SmallPtrSet<Use *, 4> &IdentityLeafs,
3485 const SmallPtrSet<Use *, 4> &SplatLeafs,
3486 const SmallPtrSet<Use *, 4> &ConcatLeafs,
3487 IRBuilderBase &Builder,
3488 const TargetTransformInfo *TTI) {
3489 auto [FrontU, FrontLane] = Item.front();
3490
3491 if (IdentityLeafs.contains(FrontU)) {
3492 return FrontU->get();
3493 }
3494 if (SplatLeafs.contains(FrontU)) {
3495 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
3496 return Builder.CreateShuffleVector(FrontU->get(), Mask);
3497 }
3498 if (ConcatLeafs.contains(FrontU)) {
3499 unsigned NumElts =
3500 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
3501 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
3502 for (unsigned S = 0; S < Values.size(); ++S)
3503 Values[S] = Item[S * NumElts].first->get();
3504
3505 while (Values.size() > 1) {
3506 NumElts *= 2;
3507 SmallVector<int, 16> Mask(NumElts, 0);
3508 std::iota(Mask.begin(), Mask.end(), 0);
3509 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
3510 for (unsigned S = 0; S < NewValues.size(); ++S)
3511 NewValues[S] =
3512 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
3513 Values = NewValues;
3514 }
3515 return Values[0];
3516 }
3517
3518 auto *I = cast<Instruction>(FrontU->get());
3519 auto *II = dyn_cast<IntrinsicInst>(I);
3520 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
3521 SmallVector<Value *> Ops(NumOps);
3522 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
3523 if (II &&
3524 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
3525 Ops[Idx] = II->getOperand(Idx);
3526 continue;
3527 }
3528 Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
3529 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
3530 Builder, TTI);
3531 }
3532
3533 SmallVector<Value *, 8> ValueList;
3534 for (const auto &Lane : Item)
3535 if (Lane.first)
3536 ValueList.push_back(Lane.first->get());
3537
3538 Type *DstTy =
3539 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
3540 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
3541 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
3542 Ops[0], Ops[1]);
3543 propagateIRFlags(Value, ValueList);
3544 return Value;
3545 }
3546 if (auto *CI = dyn_cast<CmpInst>(I)) {
3547 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3548 propagateIRFlags(Value, ValueList);
3549 return Value;
3550 }
3551 if (auto *SI = dyn_cast<SelectInst>(I)) {
3552 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3553 propagateIRFlags(Value, ValueList);
3554 return Value;
3555 }
3556 if (auto *CI = dyn_cast<CastInst>(I)) {
3557 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3558 propagateIRFlags(Value, ValueList);
3559 return Value;
3560 }
3561 if (II) {
3562 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3563 propagateIRFlags(Value, ValueList);
3564 return Value;
3565 }
3566 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3567 auto *Value =
3568 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3569 propagateIRFlags(Value, ValueList);
3570 return Value;
3571}
3572
3573// Starting from a shuffle, look up through operands tracking the shuffled index
3574// of each lane. If we can simplify away the shuffles to identities then
3575// do so.
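// For example (an illustrative case; the fold applies whenever every lane of
// the final shuffle traces back to the matching lane of a single source):
//   %lo  = shufflevector <4 x i32> %a, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
//   %hi  = shufflevector <4 x i32> %a, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
//   %nlo = xor <2 x i32> %lo, <i32 1, i32 1>
//   %nhi = xor <2 x i32> %hi, <i32 1, i32 1>
//   %r   = shufflevector <2 x i32> %nlo, <2 x i32> %nhi,
//                        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// -->
//   %r   = xor <4 x i32> %a, <i32 1, i32 1, i32 1, i32 1>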
3576bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3577 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3578 if (!Ty || I.use_empty())
3579 return false;
3580
3581 SmallVector<InstLane> Start(Ty->getNumElements());
3582 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3583 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
3584
3585 SmallVector<SmallVector<InstLane>> Worklist;
3586 Worklist.push_back(Start);
3587 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
3588 unsigned NumVisited = 0;
3589
3590 while (!Worklist.empty()) {
3591 if (++NumVisited > MaxInstrsToScan)
3592 return false;
3593
3594 SmallVector<InstLane> Item = Worklist.pop_back_val();
3595 auto [FrontU, FrontLane] = Item.front();
3596
3597 // If we found an undef first lane then bail out to keep things simple.
3598 if (!FrontU)
3599 return false;
3600
3601 // Helper to peek through bitcasts to the same value.
3602 auto IsEquiv = [&](Value *X, Value *Y) {
3603 return X->getType() == Y->getType() &&
3605 };
3606
3607 // Look for an identity value.
3608 if (FrontLane == 0 &&
3609 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
3610 Ty->getNumElements() &&
3611 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3612 Value *FrontV = Item.front().first->get();
3613 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
3614 E.value().second == (int)E.index());
3615 })) {
3616 IdentityLeafs.insert(FrontU);
3617 continue;
3618 }
3619 // Look for constants, for the moment only supporting constant splats.
3620 if (auto *C = dyn_cast<Constant>(FrontU);
3621 C && C->getSplatValue() &&
3622 all_of(drop_begin(Item), [Item](InstLane &IL) {
3623 Value *FrontV = Item.front().first->get();
3624 Use *U = IL.first;
3625 return !U || (isa<Constant>(U->get()) &&
3626 cast<Constant>(U->get())->getSplatValue() ==
3627 cast<Constant>(FrontV)->getSplatValue());
3628 })) {
3629 SplatLeafs.insert(FrontU);
3630 continue;
3631 }
3632 // Look for a splat value.
3633 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3634 auto [FrontU, FrontLane] = Item.front();
3635 auto [U, Lane] = IL;
3636 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3637 })) {
3638 SplatLeafs.insert(FrontU);
3639 continue;
3640 }
3641
3642 // We need each element to be the same type of value, and check that each
3643 // element has a single use.
3644 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3645 Value *FrontV = Item.front().first->get();
3646 if (!IL.first)
3647 return true;
3648 Value *V = IL.first->get();
3649 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3650 return false;
3651 if (V->getValueID() != FrontV->getValueID())
3652 return false;
3653 if (auto *CI = dyn_cast<CmpInst>(V))
3654 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3655 return false;
3656 if (auto *CI = dyn_cast<CastInst>(V))
3657 if (CI->getSrcTy()->getScalarType() !=
3658 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3659 return false;
3660 if (auto *SI = dyn_cast<SelectInst>(V))
3661 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3662 SI->getOperand(0)->getType() !=
3663 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3664 return false;
3665 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3666 return false;
3667 auto *II = dyn_cast<IntrinsicInst>(V);
3668 return !II || (isa<IntrinsicInst>(FrontV) &&
3669 II->getIntrinsicID() ==
3670 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3671 !II->hasOperandBundles());
3672 };
3673 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3674 // Check the operator is one that we support.
3675 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3676 // We exclude div/rem in case they hit UB from poison lanes.
3677 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3678 BO && BO->isIntDivRem())
3679 return false;
3680 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3681 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3682 continue;
3683 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3684 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3685 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3686 continue;
3687 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3688 // TODO: Handle vector widening/narrowing bitcasts.
3689 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3690 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3691 if (DstTy && SrcTy &&
3692 SrcTy->getNumElements() == DstTy->getNumElements()) {
3693 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3694 continue;
3695 }
3696 } else if (isa<SelectInst>(FrontU)) {
3697 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3698 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3699 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
3700 continue;
3701 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3702 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3703 !II->hasOperandBundles()) {
3704 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3705 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3706 &TTI)) {
3707 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3708 Value *FrontV = Item.front().first->get();
3709 Use *U = IL.first;
3710 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3711 cast<Instruction>(FrontV)->getOperand(Op));
3712 }))
3713 return false;
3714 continue;
3715 }
3716 Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
3717 }
3718 continue;
3719 }
3720 }
3721
3722 if (isFreeConcat(Item, CostKind, TTI)) {
3723 ConcatLeafs.insert(FrontU);
3724 continue;
3725 }
3726
3727 return false;
3728 }
3729
3730 if (NumVisited <= 1)
3731 return false;
3732
3733 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3734
3735 // If we got this far, we know the shuffles are superfluous and can be
3736 // removed. Scan through again and generate the new tree of instructions.
3737 Builder.SetInsertPoint(&I);
3738 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3739 ConcatLeafs, Builder, &TTI);
3740 replaceValue(I, *V);
3741 return true;
3742}
3743
3744/// Given a commutative reduction, the order of the input lanes does not alter
3745/// the results. We can use this to remove certain shuffles feeding the
3746/// reduction, removing the need to shuffle at all.
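/// For example (illustrative only):
///   %rev = shufflevector <4 x i32> %a, <4 x i32> poison,
///                        <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %rev)
/// The reversal does not change the reduced value, so the shuffle can be
/// replaced with an in-order (identity) mask and later removed entirely.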
3747bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3748 auto *II = dyn_cast<IntrinsicInst>(&I);
3749 if (!II)
3750 return false;
3751 switch (II->getIntrinsicID()) {
3752 case Intrinsic::vector_reduce_add:
3753 case Intrinsic::vector_reduce_mul:
3754 case Intrinsic::vector_reduce_and:
3755 case Intrinsic::vector_reduce_or:
3756 case Intrinsic::vector_reduce_xor:
3757 case Intrinsic::vector_reduce_smin:
3758 case Intrinsic::vector_reduce_smax:
3759 case Intrinsic::vector_reduce_umin:
3760 case Intrinsic::vector_reduce_umax:
3761 break;
3762 default:
3763 return false;
3764 }
3765
3766 // Find all the inputs when looking through operations that do not alter the
3767 // lane order (binops, for example). Currently we look for a single shuffle,
3768 // and can ignore splat values.
3769 std::queue<Value *> Worklist;
3770 SmallPtrSet<Value *, 4> Visited;
3771 ShuffleVectorInst *Shuffle = nullptr;
3772 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3773 Worklist.push(Op);
3774
3775 while (!Worklist.empty()) {
3776 Value *CV = Worklist.front();
3777 Worklist.pop();
3778 if (Visited.contains(CV))
3779 continue;
3780
3781 // Splats don't change the order, so can be safely ignored.
3782 if (isSplatValue(CV))
3783 continue;
3784
3785 Visited.insert(CV);
3786
3787 if (auto *CI = dyn_cast<Instruction>(CV)) {
3788 if (CI->isBinaryOp()) {
3789 for (auto *Op : CI->operand_values())
3790 Worklist.push(Op);
3791 continue;
3792 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3793 if (Shuffle && Shuffle != SV)
3794 return false;
3795 Shuffle = SV;
3796 continue;
3797 }
3798 }
3799
3800 // Anything else is currently an unknown node.
3801 return false;
3802 }
3803
3804 if (!Shuffle)
3805 return false;
3806
3807 // Check all uses of the binary ops and shuffles are also included in the
3808 // lane-invariant operations (Visited should be the list of lanewise
3809 // instructions, including the shuffle that we found).
3810 for (auto *V : Visited)
3811 for (auto *U : V->users())
3812 if (!Visited.contains(U) && U != &I)
3813 return false;
3814
3815 FixedVectorType *VecType =
3816 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3817 if (!VecType)
3818 return false;
3819 FixedVectorType *ShuffleInputType =
3820 dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
3821 if (!ShuffleInputType)
3822 return false;
3823 unsigned NumInputElts = ShuffleInputType->getNumElements();
3824
3825 // Find the mask from sorting the lanes into order. This is most likely to
3826 // become an identity or concat mask. Undef elements are pushed to the end.
3827 SmallVector<int> ConcatMask;
3828 Shuffle->getShuffleMask(ConcatMask);
3829 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3830 bool UsesSecondVec =
3831 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3832
3833 InstructionCost OldCost = TTI.getShuffleCost(
3834 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3835 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3836 InstructionCost NewCost = TTI.getShuffleCost(
3837 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3838 ShuffleInputType, ConcatMask, CostKind);
3839
3840 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3841 << "\n");
3842 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3843 << "\n");
3844 bool MadeChanges = false;
3845 if (NewCost < OldCost) {
3846 Builder.SetInsertPoint(Shuffle);
3847 Value *NewShuffle = Builder.CreateShuffleVector(
3848 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3849 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3850 replaceValue(*Shuffle, *NewShuffle);
3851 return true;
3852 }
3853
3854 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3855 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3856 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3857 return MadeChanges;
3858}
3859
3860/// For a given chain of patterns of the following form:
3861///
3862/// ```
3863/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3864///
3865/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3866/// ty1> %1)
3867/// OR
3868/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3869///
3870/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3871/// ...
3872/// ...
3873/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3874/// 3), <n x ty1> %(i - 2)
3875/// OR
3876/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3877///
3878/// %(i) = extractelement <n x ty1> %(i - 1), 0
3879/// ```
3880///
3881/// Where:
3882/// `mask` follows a partition pattern:
3883///
3884/// Ex:
3885/// [n = 8, p = poison]
3886///
3887/// 4 5 6 7 | p p p p
3888/// 2 3 | p p p p p p
3889/// 1 | p p p p p p p
3890///
3891/// For powers of 2, there's a consistent pattern, but for other cases
3892/// the parity of the current half value at each step decides the
3893/// next partition half (see `ExpectedParityMask` for more logical details
3894/// in generalising this).
3895///
3896/// Ex:
3897/// [n = 6]
3898///
3899/// 3 4 5 | p p p
3900/// 1 2 | p p p p
3901/// 1 | p p p p p
3902bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3903 // Going bottom-up for the pattern.
3904 std::queue<Value *> InstWorklist;
3905 InstructionCost OrigCost = 0;
3906
3907 // Common instruction operation after each shuffle op.
3908 std::optional<unsigned int> CommonCallOp = std::nullopt;
3909 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3910
3911 bool IsFirstCallOrBinInst = true;
3912 bool ShouldBeCallOrBinInst = true;
3913
3914 // This stores the last used instructions for shuffle/common op.
3915 //
3916 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3917 // instructions from either shuffle/common op.
3918 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3919
3920 Value *VecOpEE;
3921 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3922 return false;
3923
3924 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3925 if (!FVT)
3926 return false;
3927
3928 int64_t VecSize = FVT->getNumElements();
3929 if (VecSize < 2)
3930 return false;
3931
3932 // Number of levels would be ~log2(n), considering we always partition
3933 // by half for this fold pattern.
3934 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3935 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3936
3937 // This is how we generalise for all element sizes.
3938 // At each step, if vector size is odd, we need non-poison
3939 // values to cover the dominant half so we don't miss out on any element.
3940 //
3941 // This mask will help us retrieve this as we go from bottom to top:
3942 //
3943 // Mask Set -> N = N * 2 - 1
3944 // Mask Unset -> N = N * 2
3945 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3946 Cur = (Cur + 1) / 2, --Mask) {
3947 if (Cur & 1)
3948 ExpectedParityMask |= (1ll << Mask);
3949 }
3950
3951 InstWorklist.push(VecOpEE);
3952
3953 while (!InstWorklist.empty()) {
3954 Value *CI = InstWorklist.front();
3955 InstWorklist.pop();
3956
3957 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3958 if (!ShouldBeCallOrBinInst)
3959 return false;
3960
3961 if (!IsFirstCallOrBinInst && any_of(PrevVecV, equal_to(nullptr)))
3962 return false;
3963
3964 // For the first found call/bin op, the vector has to come from the
3965 // extract element op.
3966 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3967 return false;
3968 IsFirstCallOrBinInst = false;
3969
3970 if (!CommonCallOp)
3971 CommonCallOp = II->getIntrinsicID();
3972 if (II->getIntrinsicID() != *CommonCallOp)
3973 return false;
3974
3975 switch (II->getIntrinsicID()) {
3976 case Intrinsic::umin:
3977 case Intrinsic::umax:
3978 case Intrinsic::smin:
3979 case Intrinsic::smax: {
3980 auto *Op0 = II->getOperand(0);
3981 auto *Op1 = II->getOperand(1);
3982 PrevVecV[0] = Op0;
3983 PrevVecV[1] = Op1;
3984 break;
3985 }
3986 default:
3987 return false;
3988 }
3989 ShouldBeCallOrBinInst ^= 1;
3990
3991 IntrinsicCostAttributes ICA(
3992 *CommonCallOp, II->getType(),
3993 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
3994 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
3995
3996 // We may need a swap here since it can be (a, b) or (b, a)
3997 // and accordingly change as we go up.
3998 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3999 std::swap(PrevVecV[0], PrevVecV[1]);
4000 InstWorklist.push(PrevVecV[1]);
4001 InstWorklist.push(PrevVecV[0]);
4002 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
4003 // Similar logic for bin ops.
4004
4005 if (!ShouldBeCallOrBinInst)
4006 return false;
4007
4008 if (!IsFirstCallOrBinInst && any_of(PrevVecV, equal_to(nullptr)))
4009 return false;
4010
4011 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
4012 return false;
4013 IsFirstCallOrBinInst = false;
4014
4015 if (!CommonBinOp)
4016 CommonBinOp = BinOp->getOpcode();
4017
4018 if (BinOp->getOpcode() != *CommonBinOp)
4019 return false;
4020
4021 switch (*CommonBinOp) {
4022 case BinaryOperator::Add:
4023 case BinaryOperator::Mul:
4024 case BinaryOperator::Or:
4025 case BinaryOperator::And:
4026 case BinaryOperator::Xor: {
4027 auto *Op0 = BinOp->getOperand(0);
4028 auto *Op1 = BinOp->getOperand(1);
4029 PrevVecV[0] = Op0;
4030 PrevVecV[1] = Op1;
4031 break;
4032 }
4033 default:
4034 return false;
4035 }
4036 ShouldBeCallOrBinInst ^= 1;
4037
4038 OrigCost +=
4039 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
4040
4041 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
4042 std::swap(PrevVecV[0], PrevVecV[1]);
4043 InstWorklist.push(PrevVecV[1]);
4044 InstWorklist.push(PrevVecV[0]);
4045 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
4046 // We shouldn't have any null values in the previous vectors;
4047 // if so, there was a mismatch in the pattern.
4048 if (ShouldBeCallOrBinInst || any_of(PrevVecV, equal_to(nullptr)))
4049 return false;
4050
4051 if (SVInst != PrevVecV[1])
4052 return false;
4053
4054 ArrayRef<int> CurMask;
4055 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
4056 m_Mask(CurMask))))
4057 return false;
4058
4059 // Subtract the parity mask when checking the condition.
4060 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
4061 if (Mask < ShuffleMaskHalf &&
4062 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
4063 return false;
4064 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
4065 return false;
4066 }
4067
4068 // Update mask values.
4069 ShuffleMaskHalf *= 2;
4070 ShuffleMaskHalf -= (ExpectedParityMask & 1);
4071 ExpectedParityMask >>= 1;
4072
4073 OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
4074 SVInst->getType(), SVInst->getType(),
4075 CurMask, CostKind);
4076
4077 VisitedCnt += 1;
4078 if (!ExpectedParityMask && VisitedCnt == NumLevels)
4079 break;
4080
4081 ShouldBeCallOrBinInst ^= 1;
4082 } else {
4083 return false;
4084 }
4085 }
4086
4087 // Pattern should end with a shuffle op.
4088 if (ShouldBeCallOrBinInst)
4089 return false;
4090
4091 assert(VecSize != -1 && "Expected Match for Vector Size");
4092
4093 Value *FinalVecV = PrevVecV[0];
4094 if (!FinalVecV)
4095 return false;
4096
4097 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
4098
4099 Intrinsic::ID ReducedOp =
4100 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
4101 : getReductionForBinop(*CommonBinOp));
4102 if (!ReducedOp)
4103 return false;
4104
4105 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
4106 InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
4107
4108 if (NewCost >= OrigCost)
4109 return false;
4110
4111 auto *ReducedResult =
4112 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
4113 replaceValue(I, *ReducedResult);
4114
4115 return true;
4116}
4117
4118/// Determine if its more efficient to fold:
4119/// reduce(trunc(x)) -> trunc(reduce(x)).
4120/// reduce(sext(x)) -> sext(reduce(x)).
4121/// reduce(zext(x)) -> zext(reduce(x)).
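/// For example (illustrative only):
///   %t = trunc <8 x i32> %x to <8 x i16>
///   %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %t)
/// -->
///   %a = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %x)
///   %r = trunc i32 %a to i16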
4122bool VectorCombine::foldCastFromReductions(Instruction &I) {
4123 auto *II = dyn_cast<IntrinsicInst>(&I);
4124 if (!II)
4125 return false;
4126
4127 bool TruncOnly = false;
4128 Intrinsic::ID IID = II->getIntrinsicID();
4129 switch (IID) {
4130 case Intrinsic::vector_reduce_add:
4131 case Intrinsic::vector_reduce_mul:
4132 TruncOnly = true;
4133 break;
4134 case Intrinsic::vector_reduce_and:
4135 case Intrinsic::vector_reduce_or:
4136 case Intrinsic::vector_reduce_xor:
4137 break;
4138 default:
4139 return false;
4140 }
4141
4142 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4143 Value *ReductionSrc = I.getOperand(0);
4144
4145 Value *Src;
4146 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
4147 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
4148 return false;
4149
4150 auto CastOpc =
4151 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
4152
4153 auto *SrcTy = cast<VectorType>(Src->getType());
4154 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
4155 Type *ResultTy = I.getType();
4156
4157 InstructionCost OldCost = TTI.getArithmeticReductionCost(
4158 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
4159 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
4160 TTI::CastContextHint::None, CostKind,
4161 cast<CastInst>(ReductionSrc));
4162 InstructionCost NewCost =
4163 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
4164 CostKind) +
4165 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
4166 TTI::CastContextHint::None, CostKind);
4167
4168 if (OldCost <= NewCost || !NewCost.isValid())
4169 return false;
4170
4171 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
4172 II->getIntrinsicID(), {Src});
4173 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
4174 replaceValue(I, *NewCast);
4175 return true;
4176}
4177
4178/// Returns true if this ShuffleVectorInst eventually feeds into a
4179/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
4180/// chains of shuffles and binary operators (in any combination/order).
4181/// The search does not go deeper than the given Depth.
4182 static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
4183 constexpr unsigned MaxVisited = 32;
4184 SmallPtrSet<Instruction *, 8> Visited;
4185 SmallVector<Instruction *> WorkList;
4186 bool FoundReduction = false;
4187
4188 WorkList.push_back(SVI);
4189 while (!WorkList.empty()) {
4190 Instruction *I = WorkList.pop_back_val();
4191 for (User *U : I->users()) {
4192 auto *UI = cast<Instruction>(U);
4193 if (!UI || !Visited.insert(UI).second)
4194 continue;
4195 if (Visited.size() > MaxVisited)
4196 return false;
4197 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
4198 // More than one reduction reached
4199 if (FoundReduction)
4200 return false;
4201 switch (II->getIntrinsicID()) {
4202 case Intrinsic::vector_reduce_add:
4203 case Intrinsic::vector_reduce_mul:
4204 case Intrinsic::vector_reduce_and:
4205 case Intrinsic::vector_reduce_or:
4206 case Intrinsic::vector_reduce_xor:
4207 case Intrinsic::vector_reduce_smin:
4208 case Intrinsic::vector_reduce_smax:
4209 case Intrinsic::vector_reduce_umin:
4210 case Intrinsic::vector_reduce_umax:
4211 FoundReduction = true;
4212 continue;
4213 default:
4214 return false;
4215 }
4216 }
4217
4218 if (!isa<BinaryOperator, ShuffleVectorInst>(UI))
4219 return false;
4220
4221 WorkList.emplace_back(UI);
4222 }
4223 }
4224 return FoundReduction;
4225}
4226
4227/// This method looks for groups of shuffles acting on binops, of the form:
4228/// %x = shuffle ...
4229/// %y = shuffle ...
4230/// %a = binop %x, %y
4231/// %b = binop %x, %y
4232/// shuffle %a, %b, selectmask
4233/// We may, especially if the shuffle is wider than legal, be able to convert
4234/// the shuffle to a form where only parts of a and b need to be computed. On
4235/// architectures with no obvious "select" shuffle, this can reduce the total
4236/// number of operations if the target reports them as cheaper.
4237bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
4238 auto *SVI = cast<ShuffleVectorInst>(&I);
4239 auto *VT = cast<FixedVectorType>(I.getType());
4240 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
4241 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
4242 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
4243 VT != Op0->getType())
4244 return false;
4245
4246 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
4247 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
4248 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
4249 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
4250 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
4251 auto checkSVNonOpUses = [&](Instruction *I) {
4252 if (!I || I->getOperand(0)->getType() != VT)
4253 return true;
4254 return any_of(I->users(), [&](User *U) {
4255 return U != Op0 && U != Op1 &&
4256 !(isa<ShuffleVectorInst>(U) &&
4257 (InputShuffles.contains(cast<Instruction>(U)) ||
4258 isInstructionTriviallyDead(cast<Instruction>(U))));
4259 });
4260 };
4261 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
4262 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
4263 return false;
4264
4265 // Collect all the uses that are shuffles that we can transform together. We
4266 // may not have a single shuffle, but a group that can all be transformed
4267 // together profitably.
4268 SmallVector<ShuffleVectorInst *> Shuffles;
4269 auto collectShuffles = [&](Instruction *I) {
4270 for (auto *U : I->users()) {
4271 auto *SV = dyn_cast<ShuffleVectorInst>(U);
4272 if (!SV || SV->getType() != VT)
4273 return false;
4274 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
4275 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
4276 return false;
4277 if (!llvm::is_contained(Shuffles, SV))
4278 Shuffles.push_back(SV);
4279 }
4280 return true;
4281 };
4282 if (!collectShuffles(Op0) || !collectShuffles(Op1))
4283 return false;
4284 // From a reduction, we need to be processing a single shuffle, otherwise the
4285 // other uses will not be lane-invariant.
4286 if (FromReduction && Shuffles.size() > 1)
4287 return false;
4288
4289 // Add any shuffle uses for the shuffles we have found, to include them in our
4290 // cost calculations.
4291 if (!FromReduction) {
4292 for (ShuffleVectorInst *SV : Shuffles) {
4293 for (auto *U : SV->users()) {
4294 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
4295 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
4296 Shuffles.push_back(SSV);
4297 }
4298 }
4299 }
4300
4301 // For each of the output shuffles, we try to sort all the first vector
4302 // elements to the beginning, followed by the second array elements at the
4303 // end. If the binops are legalized to smaller vectors, this may reduce total
4304 // number of binops. We compute the ReconstructMask mask needed to convert
4305 // back to the original lane order.
4306 SmallVector<std::pair<int, int>> V1, V2;
4307 SmallVector<SmallVector<int>> OrigReconstructMasks;
4308 int MaxV1Elt = 0, MaxV2Elt = 0;
4309 unsigned NumElts = VT->getNumElements();
4310 for (ShuffleVectorInst *SVN : Shuffles) {
4311 SmallVector<int> Mask;
4312 SVN->getShuffleMask(Mask);
4313
4314 // Check the operands are the same as the original, or reversed (in which
4315 // case we need to commute the mask).
4316 Value *SVOp0 = SVN->getOperand(0);
4317 Value *SVOp1 = SVN->getOperand(1);
4318 if (isa<UndefValue>(SVOp1)) {
4319 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
4320 SVOp0 = SSV->getOperand(0);
4321 SVOp1 = SSV->getOperand(1);
4322 for (int &Elem : Mask) {
4323 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
4324 return false;
4325 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
4326 }
4327 }
4328 if (SVOp0 == Op1 && SVOp1 == Op0) {
4329 std::swap(SVOp0, SVOp1);
4330 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
4331 }
4332 if (SVOp0 != Op0 || SVOp1 != Op1)
4333 return false;
4334
4335 // Calculate the reconstruction mask for this shuffle, as the mask needed to
4336 // take the packed values from Op0/Op1 and reconstructing to the original
4337 // order.
4338 SmallVector<int> ReconstructMask;
4339 for (unsigned I = 0; I < Mask.size(); I++) {
4340 if (Mask[I] < 0) {
4341 ReconstructMask.push_back(-1);
4342 } else if (Mask[I] < static_cast<int>(NumElts)) {
4343 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
4344 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
4345 return Mask[I] == A.first;
4346 });
4347 if (It != V1.end())
4348 ReconstructMask.push_back(It - V1.begin());
4349 else {
4350 ReconstructMask.push_back(V1.size());
4351 V1.emplace_back(Mask[I], V1.size());
4352 }
4353 } else {
4354 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
4355 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
4356 return Mask[I] - static_cast<int>(NumElts) == A.first;
4357 });
4358 if (It != V2.end())
4359 ReconstructMask.push_back(NumElts + It - V2.begin());
4360 else {
4361 ReconstructMask.push_back(NumElts + V2.size());
4362 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
4363 }
4364 }
4365 }
4366
4367 // For reductions, we know that the lane ordering out doesn't alter the
4368 // result. In-order can help simplify the shuffle away.
4369 if (FromReduction)
4370 sort(ReconstructMask);
4371 OrigReconstructMasks.push_back(std::move(ReconstructMask));
4372 }
4373
4374 // If the maximum elements used from V1 and V2 are not larger than the new
4375 // vectors, the vectors are already packed and performing the optimization
4376 // again will likely not help any further. This also prevents us from getting
4377 // stuck in a cycle in case the costs do not also rule it out.
4378 if (V1.empty() || V2.empty() ||
4379 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
4380 MaxV2Elt == static_cast<int>(V2.size()) - 1))
4381 return false;
4382
4383 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
4384 // shuffle of another shuffle, or not a shuffle (that is treated like a
4385 // identity shuffle).
4386 auto GetBaseMaskValue = [&](Instruction *I, int M) {
4387 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4388 if (!SV)
4389 return M;
4390 if (isa<UndefValue>(SV->getOperand(1)))
4391 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4392 if (InputShuffles.contains(SSV))
4393 return SSV->getMaskValue(SV->getMaskValue(M));
4394 return SV->getMaskValue(M);
4395 };
4396
4397 // Attempt to sort the inputs by ascending mask values to make simpler input
4398 // shuffles and push complex shuffles down to the uses. We sort on the first
4399 // of the two input shuffle orders, to try and get at least one input into a
4400 // nice order.
4401 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
4402 std::pair<int, int> Y) {
4403 int MXA = GetBaseMaskValue(A, X.first);
4404 int MYA = GetBaseMaskValue(A, Y.first);
4405 return MXA < MYA;
4406 };
4407 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
4408 return SortBase(SVI0A, A, B);
4409 });
4410 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
4411 return SortBase(SVI1A, A, B);
4412 });
4413 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
4414 // modified order of the input shuffles.
4415 SmallVector<SmallVector<int>> ReconstructMasks;
4416 for (const auto &Mask : OrigReconstructMasks) {
4417 SmallVector<int> ReconstructMask;
4418 for (int M : Mask) {
4419 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
4420 auto It = find_if(V, [M](auto A) { return A.second == M; });
4421 assert(It != V.end() && "Expected all entries in Mask");
4422 return std::distance(V.begin(), It);
4423 };
4424 if (M < 0)
4425 ReconstructMask.push_back(-1);
4426 else if (M < static_cast<int>(NumElts)) {
4427 ReconstructMask.push_back(FindIndex(V1, M));
4428 } else {
4429 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
4430 }
4431 }
4432 ReconstructMasks.push_back(std::move(ReconstructMask));
4433 }
4434
4435 // Calculate the masks needed for the new input shuffles, which get padded
4436 // with poison elements.
4437 SmallVector<int> V1A, V1B, V2A, V2B;
4438 for (unsigned I = 0; I < V1.size(); I++) {
4439 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
4440 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
4441 }
4442 for (unsigned I = 0; I < V2.size(); I++) {
4443 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
4444 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
4445 }
4446 while (V1A.size() < NumElts) {
4447 V1A.push_back(PoisonMaskElem);
4448 V1B.push_back(PoisonMaskElem);
4449 }
4450 while (V2A.size() < NumElts) {
4451 V2A.push_back(PoisonMaskElem);
4452 V2B.push_back(PoisonMaskElem);
4453 }
4454
4455 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
4456 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4457 if (!SV)
4458 return C;
4459 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
4460 ? TTI::SK_PermuteSingleSrc
4461 : TTI::SK_PermuteTwoSrc,
4462 VT, VT, SV->getShuffleMask(), CostKind);
4463 };
4464 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
4465 return C +
4466 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
4467 };
4468
4469 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
4470 unsigned MaxVectorSize =
4471 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector).getFixedValue();
4472 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
4473 if (MaxElementsInVector == 0)
4474 return false;
4475 // When there are multiple shufflevector operations on the same input,
4476 // especially when the vector length is larger than the register size,
4477 // identical shuffle patterns may occur across different groups of elements.
4478 // To avoid overestimating the cost by counting these repeated shuffles more
4479 // than once, we only account for unique shuffle patterns. This adjustment
4480 // prevents inflated costs in the cost model for wide vectors split into
4481 // several register-sized groups.
4482 std::set<SmallVector<int, 4>> UniqueShuffles;
4483 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
4484 // Compute the cost for performing the shuffle over the full vector.
4485 auto ShuffleCost =
4486 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
4487 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
4488 if (NumFullVectors < 2)
4489 return C + ShuffleCost;
4490 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
4491 unsigned NumUniqueGroups = 0;
4492 unsigned NumGroups = Mask.size() / MaxElementsInVector;
4493 // For each group of MaxElementsInVector contiguous elements,
4494 // collect their shuffle pattern and insert into the set of unique patterns.
4495 for (unsigned I = 0; I < NumFullVectors; ++I) {
4496 for (unsigned J = 0; J < MaxElementsInVector; ++J)
4497 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
4498 if (UniqueShuffles.insert(SubShuffle).second)
4499 NumUniqueGroups += 1;
4500 }
4501 return C + ShuffleCost * NumUniqueGroups / NumGroups;
4502 };
4503 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
4504 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4505 if (!SV)
4506 return C;
4507 SmallVector<int, 16> Mask;
4508 SV->getShuffleMask(Mask);
4509 return AddShuffleMaskAdjustedCost(C, Mask);
4510 };
4511 // Check that input consists of ShuffleVectors applied to the same input
4512 auto AllShufflesHaveSameOperands =
4513 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
4514 if (InputShuffles.size() < 2)
4515 return false;
4516 ShuffleVectorInst *FirstSV =
4517 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
4518 if (!FirstSV)
4519 return false;
4520
4521 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
4522 return std::all_of(
4523 std::next(InputShuffles.begin()), InputShuffles.end(),
4524 [&](Instruction *I) {
4525 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
4526 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
4527 });
4528 };
4529
4530 // Get the costs of the shuffles + binops before and after with the new
4531 // shuffle masks.
4532 InstructionCost CostBefore =
4533 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
4534 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
4535 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
4536 InstructionCost(0), AddShuffleCost);
4537 if (AllShufflesHaveSameOperands(InputShuffles)) {
4538 UniqueShuffles.clear();
4539 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4540 InstructionCost(0), AddShuffleAdjustedCost);
4541 } else {
4542 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4543 InstructionCost(0), AddShuffleCost);
4544 }
4545
4546 // The new binops will be unused for lanes past the used shuffle lengths.
4547 // These types attempt to get the correct cost for that from the target.
4548 FixedVectorType *Op0SmallVT =
4549 FixedVectorType::get(VT->getScalarType(), V1.size());
4550 FixedVectorType *Op1SmallVT =
4551 FixedVectorType::get(VT->getScalarType(), V2.size());
4552 InstructionCost CostAfter =
4553 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
4554 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
4555 UniqueShuffles.clear();
4556 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
4557 InstructionCost(0), AddShuffleMaskAdjustedCost);
4558 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
4559 CostAfter +=
4560 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
4561 InstructionCost(0), AddShuffleMaskCost);
4562
4563 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
4564 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
4565 << " vs CostAfter: " << CostAfter << "\n");
4566 if (CostBefore < CostAfter ||
4567 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
4568 return false;
4569
4570 // The cost model has passed, create the new instructions.
4571 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
4572 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4573 if (!SV)
4574 return I;
4575 if (isa<UndefValue>(SV->getOperand(1)))
4576 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4577 if (InputShuffles.contains(SSV))
4578 return SSV->getOperand(Op);
4579 return SV->getOperand(Op);
4580 };
4581 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
4582 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
4583 GetShuffleOperand(SVI0A, 1), V1A);
4584 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
4585 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
4586 GetShuffleOperand(SVI0B, 1), V1B);
4587 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
4588 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
4589 GetShuffleOperand(SVI1A, 1), V2A);
4590 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
4591 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
4592 GetShuffleOperand(SVI1B, 1), V2B);
4593 Builder.SetInsertPoint(Op0);
4594 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
4595 NSV0A, NSV0B);
4596 if (auto *I = dyn_cast<Instruction>(NOp0))
4597 I->copyIRFlags(Op0, true);
4598 Builder.SetInsertPoint(Op1);
4599 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
4600 NSV1A, NSV1B);
4601 if (auto *I = dyn_cast<Instruction>(NOp1))
4602 I->copyIRFlags(Op1, true);
4603
4604 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
4605 Builder.SetInsertPoint(Shuffles[S]);
4606 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
4607 replaceValue(*Shuffles[S], *NSV, false);
4608 }
4609
4610 Worklist.pushValue(NSV0A);
4611 Worklist.pushValue(NSV0B);
4612 Worklist.pushValue(NSV1A);
4613 Worklist.pushValue(NSV1B);
4614 return true;
4615}
4616
4617/// Check if instruction depends on ZExt and this ZExt can be moved after the
4618/// instruction. Move ZExt if it is profitable. For example:
4619/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
4620/// lshr((zext(x),y) -> zext(lshr(x,trunc(y)))
4621 /// Cost model calculations take into account if zext(x) has other users and
4622/// whether it can be propagated through them too.
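/// For example (illustrative only; profitability still depends on the cost
/// model, and the fold is valid here because the high bits of the result are
/// known to be zero):
///   %zx = zext <8 x i8> %x to <8 x i32>
///   %r  = and <8 x i32> %zx, %y
/// -->
///   %ty = trunc <8 x i32> %y to <8 x i8>
///   %na = and <8 x i8> %x, %ty
///   %r  = zext <8 x i8> %na to <8 x i32>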
4623bool VectorCombine::shrinkType(Instruction &I) {
4624 Value *ZExted, *OtherOperand;
4625 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
4626 m_Value(OtherOperand))) &&
4627 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
4628 return false;
4629
4630 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
4631
4632 auto *BigTy = cast<FixedVectorType>(I.getType());
4633 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
4634 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
4635
4636 if (I.getOpcode() == Instruction::LShr) {
4637 // Check that the shift amount is less than the number of bits in the
4638 // smaller type. Otherwise, the smaller lshr will return a poison value.
4639 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
4640 if (ShAmtKB.getMaxValue().uge(BW))
4641 return false;
4642 } else {
4643 // Check that the expression overall uses at most the same number of bits as
4644 // ZExted
4645 KnownBits KB = computeKnownBits(&I, *DL);
4646 if (KB.countMaxActiveBits() > BW)
4647 return false;
4648 }
4649
4650 // Calculate costs of leaving current IR as it is and moving ZExt operation
4651 // later, along with adding truncates if needed
4653 Instruction::ZExt, BigTy, SmallTy,
4654 TargetTransformInfo::CastContextHint::None, CostKind);
4655 InstructionCost CurrentCost = ZExtCost;
4656 InstructionCost ShrinkCost = 0;
4657
4658 // Calculate total cost and check that we can propagate through all ZExt users
4659 for (User *U : ZExtOperand->users()) {
4660 auto *UI = cast<Instruction>(U);
4661 if (UI == &I) {
4662 CurrentCost +=
4663 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4664 ShrinkCost +=
4665 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4666 ShrinkCost += ZExtCost;
4667 continue;
4668 }
4669
4670 if (!Instruction::isBinaryOp(UI->getOpcode()))
4671 return false;
4672
4673 // Check if we can propagate ZExt through its other users
4674 KnownBits KB = computeKnownBits(UI, *DL);
4675 if (KB.countMaxActiveBits() > BW)
4676 return false;
4677
4678 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4679 ShrinkCost +=
4680 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4681 ShrinkCost += ZExtCost;
4682 }
4683
4684 // If the other instruction operand is not a constant, we'll need to
4685 // generate a truncate instruction. So we have to adjust cost
4686 if (!isa<Constant>(OtherOperand))
4687 ShrinkCost += TTI.getCastInstrCost(
4688 Instruction::Trunc, SmallTy, BigTy,
4689 TargetTransformInfo::CastContextHint::None, CostKind);
4690
4691 // If the cost of shrinking types and leaving the IR is the same, we'll lean
4692 // towards modifying the IR because shrinking opens opportunities for other
4693 // shrinking optimisations.
4694 if (ShrinkCost > CurrentCost)
4695 return false;
4696
4697 Builder.SetInsertPoint(&I);
4698 Value *Op0 = ZExted;
4699 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
4700 // Keep the order of operands the same
4701 if (I.getOperand(0) == OtherOperand)
4702 std::swap(Op0, Op1);
4703 Value *NewBinOp =
4704 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
4705 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
4706 cast<Instruction>(NewBinOp)->copyMetadata(I);
4707 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
4708 replaceValue(I, *NewZExtr);
4709 return true;
4710}
4711
4712/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
4713/// shuffle (DstVec, SrcVec, Mask)
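/// As an illustrative sketch (hypothetical values, equal-length vectors), with
/// ExtIdx = 2 and InsIdx = 0:
///   %e = extractelement <4 x i32> %src, i64 2
///   %r = insertelement <4 x i32> %dst, i32 %e, i64 0
/// becomes, when the shuffle is no more expensive than the pair,
///   %r = shufflevector <4 x i32> %dst, <4 x i32> %src,
///                      <4 x i32> <i32 6, i32 1, i32 2, i32 3>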
4714bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
4715 Value *DstVec, *SrcVec;
4716 uint64_t ExtIdx, InsIdx;
4717 if (!match(&I,
4718 m_InsertElt(m_Value(DstVec),
4719 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
4720 m_ConstantInt(InsIdx))))
4721 return false;
4722
4723 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
4724 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
4725 // The vectors may differ in length, but their element types must match.
4726 if (!DstVecTy || !SrcVecTy ||
4727 SrcVecTy->getElementType() != DstVecTy->getElementType())
4728 return false;
4729
4730 unsigned NumDstElts = DstVecTy->getNumElements();
4731 unsigned NumSrcElts = SrcVecTy->getNumElements();
4732 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
4733 return false;
4734
4735 // Insertion into poison is a cheaper single operand shuffle.
4736 TargetTransformInfo::ShuffleKind SK;
4737 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
4738
4739 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4740 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
4741 if (NeedDstSrcSwap) {
4742 SK = TargetTransformInfo::SK_PermuteSingleSrc;
4743 Mask[InsIdx] = ExtIdx % NumDstElts;
4744 std::swap(DstVec, SrcVec);
4745 } else {
4746 SK = TargetTransformInfo::SK_PermuteTwoSrc;
4747 std::iota(Mask.begin(), Mask.end(), 0);
4748 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
4749 }
4750
4751 // Cost
4752 auto *Ins = cast<InsertElementInst>(&I);
4753 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
4754 InstructionCost InsCost =
4755 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
4756 InstructionCost ExtCost =
4757 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
4758 InstructionCost OldCost = ExtCost + InsCost;
4759
4760 InstructionCost NewCost = 0;
4761 SmallVector<int> ExtToVecMask;
4762 if (!NeedExpOrNarrow) {
4763 // Ignore 'free' identity insertion shuffle.
4764 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
4765 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
4766 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
4767 nullptr, {DstVec, SrcVec});
4768 } else {
4769 // When creating a length-changing vector, always try to keep the relevant
4770 // element in an equivalent position, so that bulk shuffles are more likely
4771 // to be useful.
4772 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4773 ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
4774 // Add cost for expanding or narrowing
4775 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
4776 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
4777 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
4778 }
4779
4780 if (!Ext->hasOneUse())
4781 NewCost += ExtCost;
4782
4783 LLVM_DEBUG(dbgs() << "Found an insert/extract shuffle-like pair: " << I
4784 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4785 << "\n");
4786
4787 if (OldCost < NewCost)
4788 return false;
4789
4790 if (NeedExpOrNarrow) {
4791 if (!NeedDstSrcSwap)
4792 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
4793 else
4794 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
4795 }
4796
4797 // Canonicalize undef param to RHS to help further folds.
4798 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
4799 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
4800 std::swap(DstVec, SrcVec);
4801 }
4802
4803 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
4804 replaceValue(I, *Shuf);
4805
4806 return true;
4807}
4808
4809/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
4810/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
4811/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
4812/// before casting it back into `<vscale x 16 x i32>`.
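/// Concretely, for the 32-bit splats above the combined 64-bit splat value is
/// (zext(777) << 32) | zext(666) = 0x000003090000029A.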
4813bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
4814 const APInt *SplatVal0, *SplatVal1;
4815 if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
4816 m_APInt(SplatVal0), m_APInt(SplatVal1))))
4817 return false;
4818
4819 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
4820 << "\n");
4821
4822 auto *VTy =
4823 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
4824 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
4825 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
4826
4827 // We use <= rather than < here so that we bail out when the costs of the
4828 // interleave2 intrinsic and the bitcast are both invalid. Even if they both
4829 // have valid and equal costs, it is probably not a good idea to emit a
4830 // high-cost constant splat.
4831 if (TTI.getInstructionCost(&I, CostKind) <=
4832 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
4833 TTI::CastContextHint::None, CostKind)) {
4834 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
4835 << *I.getType() << " is too high.\n");
4836 return false;
4837 }
4838
4839 APInt NewSplatVal = SplatVal1->zext(Width * 2);
4840 NewSplatVal <<= Width;
4841 NewSplatVal |= SplatVal0->zext(Width * 2);
4842 auto *NewSplat = ConstantVector::getSplat(
4843 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
4844
4845 IRBuilder<> Builder(&I);
4846 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
4847 return true;
4848}
4849
4850// Attempt to shrink loads that are only used by shufflevector instructions.
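// As an illustrative sketch (hypothetical IR), a load whose only users read
// the low lanes:
//   %v = load <8 x i32>, ptr %p, align 32
//   %s = shufflevector <8 x i32> %v, <8 x i32> poison,
//                      <4 x i32> <i32 0, i32 2, i32 1, i32 3>
// may be shrunk, when the cost model agrees, to
//   %v4 = load <4 x i32>, ptr %p, align 32
//   %s  = shufflevector <4 x i32> %v4, <4 x i32> poison,
//                       <4 x i32> <i32 0, i32 2, i32 1, i32 3>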
4851bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
4852 auto *OldLoad = dyn_cast<LoadInst>(&I);
4853 if (!OldLoad || !OldLoad->isSimple())
4854 return false;
4855
4856 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
4857 if (!OldLoadTy)
4858 return false;
4859
4860 unsigned const OldNumElements = OldLoadTy->getNumElements();
4861
4862 // Search all uses of load. If all uses are shufflevector instructions, and
4863 // the second operands are all poison values, find the minimum and maximum
4864 // indices of the vector elements referenced by all shuffle masks.
4865 // Otherwise return `std::nullopt`.
4866 using IndexRange = std::pair<int, int>;
4867 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
4868 IndexRange OutputRange = IndexRange(OldNumElements, -1);
4869 for (llvm::Use &Use : I.uses()) {
4870 // Ensure all uses match the required pattern.
4871 User *Shuffle = Use.getUser();
4872 ArrayRef<int> Mask;
4873
4874 if (!match(Shuffle,
4875 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
4876 return std::nullopt;
4877
4878 // Ignore shufflevector instructions that have no uses.
4879 if (Shuffle->use_empty())
4880 continue;
4881
4882 // Find the min and max indices used by the shufflevector instruction.
4883 for (int Index : Mask) {
4884 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
4885 OutputRange.first = std::min(Index, OutputRange.first);
4886 OutputRange.second = std::max(Index, OutputRange.second);
4887 }
4888 }
4889 }
4890
4891 if (OutputRange.second < OutputRange.first)
4892 return std::nullopt;
4893
4894 return OutputRange;
4895 };
4896
4897 // Get the range of vector elements used by shufflevector instructions.
4898 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
4899 unsigned const NewNumElements = Indices->second + 1u;
4900
4901 // If the range of vector elements is smaller than the full load, attempt
4902 // to create a smaller load.
4903 if (NewNumElements < OldNumElements) {
4904 IRBuilder Builder(&I);
4905 Builder.SetCurrentDebugLocation(I.getDebugLoc());
4906
4907 // Calculate costs of old and new ops.
4908 Type *ElemTy = OldLoadTy->getElementType();
4909 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
4910 Value *PtrOp = OldLoad->getPointerOperand();
4911
4912 InstructionCost OldCost = TTI.getMemoryOpCost(
4913 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
4914 OldLoad->getPointerAddressSpace(), CostKind);
4915 InstructionCost NewCost =
4916 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
4917 OldLoad->getPointerAddressSpace(), CostKind);
4918
4919 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
4920 SmallVector<UseEntry, 4u> NewUses;
4921 unsigned const MaxIndex = NewNumElements * 2u;
4922
4923 for (llvm::Use &Use : I.uses()) {
4924 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
4925 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
4926
4927 // Create entry for new use.
4928 NewUses.push_back({Shuffle, OldMask});
4929
4930 // Validate mask indices.
4931 for (int Index : OldMask) {
4932 if (Index >= static_cast<int>(MaxIndex))
4933 return false;
4934 }
4935
4936 // Update costs.
4937 OldCost +=
4938 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4939 OldLoadTy, OldMask, CostKind);
4940 NewCost +=
4941 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4942 NewLoadTy, OldMask, CostKind);
4943 }
4944
4945 LLVM_DEBUG(
4946 dbgs() << "Found a load used only by shufflevector instructions: "
4947 << I << "\n OldCost: " << OldCost
4948 << " vs NewCost: " << NewCost << "\n");
4949
4950 if (OldCost < NewCost || !NewCost.isValid())
4951 return false;
4952
4953 // Create new load of smaller vector.
4954 auto *NewLoad = cast<LoadInst>(
4955 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
4956 NewLoad->copyMetadata(I);
4957
4958 // Replace all uses.
4959 for (UseEntry &Use : NewUses) {
4960 ShuffleVectorInst *Shuffle = Use.first;
4961 std::vector<int> &NewMask = Use.second;
4962
4963 Builder.SetInsertPoint(Shuffle);
4964 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
4965 Value *NewShuffle = Builder.CreateShuffleVector(
4966 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
4967
4968 replaceValue(*Shuffle, *NewShuffle, false);
4969 }
4970
4971 return true;
4972 }
4973 }
4974 return false;
4975}
4976
4977// Attempt to narrow a phi of shufflevector instructions where the two incoming
4978// values have the same operands but different masks. If the two shuffle masks
4979// are offsets of one another we can use one branch to rotate the incoming
4980// vector and perform one larger shuffle after the phi.
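// As an illustrative sketch (hypothetical IR), take %op : <4 x i32> with two
// widening shuffles whose masks are rotations of one another:
//   %s0 = shufflevector <4 x i32> %op, <4 x i32> poison,
//             <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0>
//   %s1 = shufflevector <4 x i32> %op, <4 x i32> poison,
//             <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
//   %phi = phi <8 x i32> [ %s0, %bb0 ], [ %s1, %bb1 ]
// The mask difference is 1 in every lane, so %op can be rotated by one element
// on %bb0's edge, the phi narrowed to <4 x i32>, and %s1's mask applied once
// after the phi.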
4981bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
4982 auto *Phi = dyn_cast<PHINode>(&I);
4983 if (!Phi || Phi->getNumIncomingValues() != 2u)
4984 return false;
4985
4986 Value *Op = nullptr;
4987 ArrayRef<int> Mask0;
4988 ArrayRef<int> Mask1;
4989
4990 if (!match(Phi->getOperand(0u),
4991 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
4992 !match(Phi->getOperand(1u),
4993 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
4994 return false;
4995
4996 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
4997
4998 // Ensure result vectors are wider than the argument vector.
4999 auto *InputVT = cast<FixedVectorType>(Op->getType());
5000 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
5001 auto const InputNumElements = InputVT->getNumElements();
5002
5003 if (InputNumElements >= ResultVT->getNumElements())
5004 return false;
5005
5006 // Take the difference of the two shuffle masks at each index. Ignore poison
5007 // values at the same index in both masks.
5008 SmallVector<int, 16> NewMask;
5009 NewMask.reserve(Mask0.size());
5010
5011 for (auto [M0, M1] : zip(Mask0, Mask1)) {
5012 if (M0 >= 0 && M1 >= 0)
5013 NewMask.push_back(M0 - M1);
5014 else if (M0 == -1 && M1 == -1)
5015 continue;
5016 else
5017 return false;
5018 }
5019
5020 // Ensure all elements of the new mask are equal. If the difference between
5021 // the incoming mask elements is the same, the two must be constant offsets
5022 // of one another.
5023 if (NewMask.empty() || !all_equal(NewMask))
5024 return false;
5025
5026 // Create new mask using difference of the two incoming masks.
5027 int MaskOffset = NewMask[0u];
5028 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
5029 NewMask.clear();
5030
5031 for (unsigned I = 0u; I < InputNumElements; ++I) {
5032 NewMask.push_back(Index);
5033 Index = (Index + 1u) % InputNumElements;
5034 }
5035
5036 // Calculate costs for worst cases and compare.
5037 auto const Kind = TTI::SK_PermuteSingleSrc;
5038 auto OldCost =
5039 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
5040 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
5041 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
5042 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
5043
5044 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
5045 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
5046 << "\n");
5047
5048 if (NewCost > OldCost)
5049 return false;
5050
5051 // Create new shuffles and narrowed phi.
5052 auto Builder = IRBuilder(Shuf);
5053 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
5054 auto *PoisonVal = PoisonValue::get(InputVT);
5055 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
5056 Worklist.push(cast<Instruction>(NewShuf0));
5057
5058 Builder.SetInsertPoint(Phi);
5059 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
5060 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
5061 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
5062 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
5063
5064 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
5065 PoisonVal = PoisonValue::get(NewPhi->getType());
5066 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
5067
5068 replaceValue(*Phi, *NewShuf1);
5069 return true;
5070}
5071
5072/// This is the entry point for all transforms. Pass manager differences are
5073/// handled in the callers of this function.
5074bool VectorCombine::run() {
5075 if (DisableVectorCombine)
5076 return false;
5077
5078 // Don't attempt vectorization if the target does not support vectors.
5079 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
5080 return false;
5081
5082 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
5083
5084 auto FoldInst = [this](Instruction &I) {
5085 Builder.SetInsertPoint(&I);
5086 bool IsVectorType = isa<VectorType>(I.getType());
5087 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
5088 auto Opcode = I.getOpcode();
5089
5090 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
5091
5092 // These folds should be beneficial regardless of when this pass is run
5093 // in the optimization pipeline.
5094 // The type checking is for run-time efficiency. We can avoid wasting time
5095 // dispatching to folding functions if there's no chance of matching.
5096 if (IsFixedVectorType) {
5097 switch (Opcode) {
5098 case Instruction::InsertElement:
5099 if (vectorizeLoadInsert(I))
5100 return true;
5101 break;
5102 case Instruction::ShuffleVector:
5103 if (widenSubvectorLoad(I))
5104 return true;
5105 break;
5106 default:
5107 break;
5108 }
5109 }
5110
5111 // This transform works with scalable and fixed vectors
5112 // TODO: Identify and allow other scalable transforms
5113 if (IsVectorType) {
5114 if (scalarizeOpOrCmp(I))
5115 return true;
5116 if (scalarizeLoad(I))
5117 return true;
5118 if (scalarizeExtExtract(I))
5119 return true;
5120 if (scalarizeVPIntrinsic(I))
5121 return true;
5122 if (foldInterleaveIntrinsics(I))
5123 return true;
5124 }
5125
5126 if (Opcode == Instruction::Store)
5127 if (foldSingleElementStore(I))
5128 return true;
5129
5130 // If this is an early pipeline invocation of this pass, we are done.
5131 if (TryEarlyFoldsOnly)
5132 return false;
5133
5134 // Otherwise, try folds that improve codegen but may interfere with
5135 // early IR canonicalizations.
5136 // The type checking is for run-time efficiency. We can avoid wasting time
5137 // dispatching to folding functions if there's no chance of matching.
5138 if (IsFixedVectorType) {
5139 switch (Opcode) {
5140 case Instruction::InsertElement:
5141 if (foldInsExtFNeg(I))
5142 return true;
5143 if (foldInsExtBinop(I))
5144 return true;
5145 if (foldInsExtVectorToShuffle(I))
5146 return true;
5147 break;
5148 case Instruction::ShuffleVector:
5149 if (foldPermuteOfBinops(I))
5150 return true;
5151 if (foldShuffleOfBinops(I))
5152 return true;
5153 if (foldShuffleOfSelects(I))
5154 return true;
5155 if (foldShuffleOfCastops(I))
5156 return true;
5157 if (foldShuffleOfShuffles(I))
5158 return true;
5159 if (foldPermuteOfIntrinsic(I))
5160 return true;
5161 if (foldShufflesOfLengthChangingShuffles(I))
5162 return true;
5163 if (foldShuffleOfIntrinsics(I))
5164 return true;
5165 if (foldSelectShuffle(I))
5166 return true;
5167 if (foldShuffleToIdentity(I))
5168 return true;
5169 break;
5170 case Instruction::Load:
5171 if (shrinkLoadForShuffles(I))
5172 return true;
5173 break;
5174 case Instruction::BitCast:
5175 if (foldBitcastShuffle(I))
5176 return true;
5177 if (foldSelectsFromBitcast(I))
5178 return true;
5179 break;
5180 case Instruction::And:
5181 case Instruction::Or:
5182 case Instruction::Xor:
5183 if (foldBitOpOfCastops(I))
5184 return true;
5185 if (foldBitOpOfCastConstant(I))
5186 return true;
5187 break;
5188 case Instruction::PHI:
5189 if (shrinkPhiOfShuffles(I))
5190 return true;
5191 break;
5192 default:
5193 if (shrinkType(I))
5194 return true;
5195 break;
5196 }
5197 } else {
5198 switch (Opcode) {
5199 case Instruction::Call:
5200 if (foldShuffleFromReductions(I))
5201 return true;
5202 if (foldCastFromReductions(I))
5203 return true;
5204 break;
5205 case Instruction::ExtractElement:
5206 if (foldShuffleChainsToReduce(I))
5207 return true;
5208 break;
5209 case Instruction::ICmp:
5210 case Instruction::FCmp:
5211 if (foldExtractExtract(I))
5212 return true;
5213 break;
5214 case Instruction::Or:
5215 if (foldConcatOfBoolMasks(I))
5216 return true;
5217 [[fallthrough]];
5218 default:
5219 if (Instruction::isBinaryOp(Opcode)) {
5220 if (foldExtractExtract(I))
5221 return true;
5222 if (foldExtractedCmps(I))
5223 return true;
5224 if (foldBinopOfReductions(I))
5225 return true;
5226 }
5227 break;
5228 }
5229 }
5230 return false;
5231 };
5232
5233 bool MadeChange = false;
5234 for (BasicBlock &BB : F) {
5235 // Ignore unreachable basic blocks.
5236 if (!DT.isReachableFromEntry(&BB))
5237 continue;
5238 // Iterate manually so that we can erase instructions inside the loop.
5239 // make_early_inc_range is not applicable here, as the next iterator may
5240 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
5241 // We manually maintain the next instruction and update it when it is about
5242 // to be deleted.
5243 Instruction *I = &BB.front();
5244 while (I) {
5245 NextInst = I->getNextNode();
5246 if (!I->isDebugOrPseudoInst())
5247 MadeChange |= FoldInst(*I);
5248 I = NextInst;
5249 }
5250 }
5251
5252 NextInst = nullptr;
5253
5254 while (!Worklist.isEmpty()) {
5255 Instruction *I = Worklist.removeOne();
5256 if (!I)
5257 continue;
5258
5259 if (isInstructionTriviallyDead(I)) {
5260 eraseInstruction(*I);
5261 continue;
5262 }
5263
5264 MadeChange |= FoldInst(*I);
5265 }
5266
5267 return MadeChange;
5268}
5269
5270 PreservedAnalyses VectorCombinePass::run(Function &F,
5271 FunctionAnalysisManager &FAM) {
5272 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
5273 TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
5274 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
5275 AAResults &AA = FAM.getResult<AAManager>(F);
5276 const DataLayout *DL = &F.getDataLayout();
5277 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
5278 TryEarlyFoldsOnly);
5279 if (!Combiner.run())
5280 return PreservedAnalyses::all();
5281 PreservedAnalyses PA;
5282 PA.preserveSet<CFGAnalyses>();
5283 return PA;
5284}