LLVM 23.0.0git
VectorCombine.cpp
Go to the documentation of this file.
1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
39#include <numeric>
40#include <optional>
41#include <queue>
42#include <set>
43
44#define DEBUG_TYPE "vector-combine"
46
47using namespace llvm;
48using namespace llvm::PatternMatch;
49
50STATISTIC(NumVecLoad, "Number of vector loads formed");
51STATISTIC(NumVecCmp, "Number of vector compares formed");
52STATISTIC(NumVecBO, "Number of vector binops formed");
53STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
54STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
55STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
56STATISTIC(NumScalarCmp, "Number of scalar compares formed");
57STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
58
60 "disable-vector-combine", cl::init(false), cl::Hidden,
61 cl::desc("Disable all vector combine transforms"));
62
64 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
65 cl::desc("Disable binop extract to shuffle transforms"));
66
68 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
69 cl::desc("Max number of instructions to scan for vector combining."));
70
71static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
72
73namespace {
74class VectorCombine {
75public:
76 VectorCombine(Function &F, const TargetTransformInfo &TTI,
79 bool TryEarlyFoldsOnly)
80 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
81 DT(DT), AA(AA), DL(DL), CostKind(CostKind),
82 SQ(*DL, /*TLI=*/nullptr, &DT, &AC),
83 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
84
85 bool run();
86
87private:
88 Function &F;
90 const TargetTransformInfo &TTI;
91 const DominatorTree &DT;
92 AAResults &AA;
93 const DataLayout *DL;
94 TTI::TargetCostKind CostKind;
95 const SimplifyQuery SQ;
96
97 /// If true, only perform beneficial early IR transforms. Do not introduce new
98 /// vector operations.
99 bool TryEarlyFoldsOnly;
100
101 InstructionWorklist Worklist;
102
103 /// Next instruction to iterate. It will be updated when it is erased by
104 /// RecursivelyDeleteTriviallyDeadInstructions.
105 Instruction *NextInst;
106
107 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
108 // parameter. That should be updated to specific sub-classes because the
109 // run loop was changed to dispatch on opcode.
110 bool vectorizeLoadInsert(Instruction &I);
111 bool widenSubvectorLoad(Instruction &I);
112 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
113 ExtractElementInst *Ext1,
114 unsigned PreferredExtractIndex) const;
115 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
116 const Instruction &I,
117 ExtractElementInst *&ConvertToShuffle,
118 unsigned PreferredExtractIndex);
119 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
121 bool foldExtractExtract(Instruction &I);
122 bool foldInsExtFNeg(Instruction &I);
123 bool foldInsExtBinop(Instruction &I);
124 bool foldInsExtVectorToShuffle(Instruction &I);
125 bool foldBitOpOfCastops(Instruction &I);
126 bool foldBitOpOfCastConstant(Instruction &I);
127 bool foldBitcastShuffle(Instruction &I);
128 bool scalarizeOpOrCmp(Instruction &I);
129 bool scalarizeVPIntrinsic(Instruction &I);
130 bool foldExtractedCmps(Instruction &I);
131 bool foldSelectsFromBitcast(Instruction &I);
132 bool foldBinopOfReductions(Instruction &I);
133 bool foldSingleElementStore(Instruction &I);
134 bool scalarizeLoad(Instruction &I);
135 bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
136 bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
137 bool scalarizeExtExtract(Instruction &I);
138 bool foldConcatOfBoolMasks(Instruction &I);
139 bool foldPermuteOfBinops(Instruction &I);
140 bool foldShuffleOfBinops(Instruction &I);
141 bool foldShuffleOfSelects(Instruction &I);
142 bool foldShuffleOfCastops(Instruction &I);
143 bool foldShuffleOfShuffles(Instruction &I);
144 bool foldPermuteOfIntrinsic(Instruction &I);
145 bool foldShufflesOfLengthChangingShuffles(Instruction &I);
146 bool foldShuffleOfIntrinsics(Instruction &I);
147 bool foldShuffleToIdentity(Instruction &I);
148 bool foldShuffleFromReductions(Instruction &I);
149 bool foldShuffleChainsToReduce(Instruction &I);
150 bool foldCastFromReductions(Instruction &I);
151 bool foldSignBitReductionCmp(Instruction &I);
152 bool foldICmpEqZeroVectorReduce(Instruction &I);
153 bool foldEquivalentReductionCmp(Instruction &I);
154 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
155 bool foldInterleaveIntrinsics(Instruction &I);
156 bool shrinkType(Instruction &I);
157 bool shrinkLoadForShuffles(Instruction &I);
158 bool shrinkPhiOfShuffles(Instruction &I);
159
160 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
161 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
162 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
163 Old.replaceAllUsesWith(&New);
164 if (auto *NewI = dyn_cast<Instruction>(&New)) {
165 New.takeName(&Old);
166 Worklist.pushUsersToWorkList(*NewI);
167 Worklist.pushValue(NewI);
168 }
169 if (Erase && isInstructionTriviallyDead(&Old)) {
170 eraseInstruction(Old);
171 } else {
172 Worklist.push(&Old);
173 }
174 }
175
176 void eraseInstruction(Instruction &I) {
177 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
178 SmallVector<Value *> Ops(I.operands());
179 Worklist.remove(&I);
180 I.eraseFromParent();
181
182 // Push remaining users of the operands and then the operand itself - allows
183 // further folds that were hindered by OneUse limits.
184 SmallPtrSet<Value *, 4> Visited;
185 for (Value *Op : Ops) {
186 if (!Visited.contains(Op)) {
187 if (auto *OpI = dyn_cast<Instruction>(Op)) {
189 OpI, nullptr, nullptr, [&](Value *V) {
190 if (auto *I = dyn_cast<Instruction>(V)) {
191 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
192 Worklist.remove(I);
193 if (I == NextInst)
194 NextInst = NextInst->getNextNode();
195 Visited.insert(I);
196 }
197 }))
198 continue;
199 Worklist.pushUsersToWorkList(*OpI);
200 Worklist.pushValue(OpI);
201 }
202 }
203 }
204 }
205};
206} // namespace
207
208/// Return the source operand of a potentially bitcasted value. If there is no
209/// bitcast, return the input value itself.
211 while (auto *BitCast = dyn_cast<BitCastInst>(V))
212 V = BitCast->getOperand(0);
213 return V;
214}
215
216static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
217 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
218 // The widened load may load data from dirty regions or create data races
219 // non-existent in the source.
220 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
221 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
223 return false;
224
225 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
226 // sure we have all of our type-based constraints in place for this target.
227 Type *ScalarTy = Load->getType()->getScalarType();
228 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
229 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
230 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
231 ScalarSize % 8 != 0)
232 return false;
233
234 return true;
235}
236
237bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
238 // Match insert into fixed vector of scalar value.
239 // TODO: Handle non-zero insert index.
240 Value *Scalar;
241 if (!match(&I,
243 return false;
244
245 // Optionally match an extract from another vector.
246 Value *X;
247 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
248 if (!HasExtract)
249 X = Scalar;
250
251 auto *Load = dyn_cast<LoadInst>(X);
252 if (!canWidenLoad(Load, TTI))
253 return false;
254
255 Type *ScalarTy = Scalar->getType();
256 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
257 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
258
259 // Check safety of replacing the scalar load with a larger vector load.
260 // We use minimal alignment (maximum flexibility) because we only care about
261 // the dereferenceable region. When calculating cost and creating a new op,
262 // we may use a larger value based on alignment attributes.
263 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
264 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
265
266 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
267 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
268 unsigned OffsetEltIndex = 0;
269 Align Alignment = Load->getAlign();
270 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, SQ.AC,
271 SQ.DT)) {
272 // It is not safe to load directly from the pointer, but we can still peek
273 // through gep offsets and check if it safe to load from a base address with
274 // updated alignment. If it is, we can shuffle the element(s) into place
275 // after loading.
276 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
277 APInt Offset(OffsetBitWidth, 0);
279
280 // We want to shuffle the result down from a high element of a vector, so
281 // the offset must be positive.
282 if (Offset.isNegative())
283 return false;
284
285 // The offset must be a multiple of the scalar element to shuffle cleanly
286 // in the element's size.
287 uint64_t ScalarSizeInBytes = ScalarSize / 8;
288 if (Offset.urem(ScalarSizeInBytes) != 0)
289 return false;
290
291 // If we load MinVecNumElts, will our target element still be loaded?
292 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
293 if (OffsetEltIndex >= MinVecNumElts)
294 return false;
295
296 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load,
297 SQ.AC, SQ.DT))
298 return false;
299
300 // Update alignment with offset value. Note that the offset could be negated
301 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
302 // negation does not change the result of the alignment calculation.
303 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
304 }
305
306 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
307 // Use the greater of the alignment on the load or its source pointer.
308 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
309 Type *LoadTy = Load->getType();
310 unsigned AS = Load->getPointerAddressSpace();
311 InstructionCost OldCost =
312 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
313 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
314 OldCost +=
315 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
316 /* Insert */ true, HasExtract, CostKind);
317
318 // New pattern: load VecPtr
319 InstructionCost NewCost =
320 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
321 // Optionally, we are shuffling the loaded vector element(s) into place.
322 // For the mask set everything but element 0 to undef to prevent poison from
323 // propagating from the extra loaded memory. This will also optionally
324 // shrink/grow the vector from the loaded size to the output size.
325 // We assume this operation has no cost in codegen if there was no offset.
326 // Note that we could use freeze to avoid poison problems, but then we might
327 // still need a shuffle to change the vector size.
328 auto *Ty = cast<FixedVectorType>(I.getType());
329 unsigned OutputNumElts = Ty->getNumElements();
330 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
331 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
332 Mask[0] = OffsetEltIndex;
333 if (OffsetEltIndex)
334 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
335 CostKind);
336
337 // We can aggressively convert to the vector form because the backend can
338 // invert this transform if it does not result in a performance win.
339 if (OldCost < NewCost || !NewCost.isValid())
340 return false;
341
342 // It is safe and potentially profitable to load a vector directly:
343 // inselt undef, load Scalar, 0 --> load VecPtr
344 IRBuilder<> Builder(Load);
345 Value *CastedPtr =
346 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
347 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
348 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
349
350 replaceValue(I, *VecLd);
351 ++NumVecLoad;
352 return true;
353}
354
355/// If we are loading a vector and then inserting it into a larger vector with
356/// undefined elements, try to load the larger vector and eliminate the insert.
357/// This removes a shuffle in IR and may allow combining of other loaded values.
358bool VectorCombine::widenSubvectorLoad(Instruction &I) {
359 // Match subvector insert of fixed vector.
360 auto *Shuf = cast<ShuffleVectorInst>(&I);
361 if (!Shuf->isIdentityWithPadding())
362 return false;
363
364 // Allow a non-canonical shuffle mask that is choosing elements from op1.
365 unsigned NumOpElts =
366 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
367 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
368 return M >= (int)(NumOpElts);
369 });
370
371 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
372 if (!canWidenLoad(Load, TTI))
373 return false;
374
375 // We use minimal alignment (maximum flexibility) because we only care about
376 // the dereferenceable region. When calculating cost and creating a new op,
377 // we may use a larger value based on alignment attributes.
378 auto *Ty = cast<FixedVectorType>(I.getType());
379 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
380 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
381 Align Alignment = Load->getAlign();
382 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, SQ.AC,
383 SQ.DT))
384 return false;
385
386 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
387 Type *LoadTy = Load->getType();
388 unsigned AS = Load->getPointerAddressSpace();
389
390 // Original pattern: insert_subvector (load PtrOp)
391 // This conservatively assumes that the cost of a subvector insert into an
392 // undef value is 0. We could add that cost if the cost model accurately
393 // reflects the real cost of that operation.
394 InstructionCost OldCost =
395 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
396
397 // New pattern: load PtrOp
398 InstructionCost NewCost =
399 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
400
401 // We can aggressively convert to the vector form because the backend can
402 // invert this transform if it does not result in a performance win.
403 if (OldCost < NewCost || !NewCost.isValid())
404 return false;
405
406 IRBuilder<> Builder(Load);
407 Value *CastedPtr =
408 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
409 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
410 replaceValue(I, *VecLd);
411 ++NumVecLoad;
412 return true;
413}
414
415/// Determine which, if any, of the inputs should be replaced by a shuffle
416/// followed by extract from a different index.
417ExtractElementInst *VectorCombine::getShuffleExtract(
418 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
419 unsigned PreferredExtractIndex = InvalidIndex) const {
420 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
421 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
422 assert(Index0C && Index1C && "Expected constant extract indexes");
423
424 unsigned Index0 = Index0C->getZExtValue();
425 unsigned Index1 = Index1C->getZExtValue();
426
427 // If the extract indexes are identical, no shuffle is needed.
428 if (Index0 == Index1)
429 return nullptr;
430
431 Type *VecTy = Ext0->getVectorOperand()->getType();
432 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
433 InstructionCost Cost0 =
434 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
435 InstructionCost Cost1 =
436 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
437
438 // If both costs are invalid no shuffle is needed
439 if (!Cost0.isValid() && !Cost1.isValid())
440 return nullptr;
441
442 // We are extracting from 2 different indexes, so one operand must be shuffled
443 // before performing a vector operation and/or extract. The more expensive
444 // extract will be replaced by a shuffle.
445 if (Cost0 > Cost1)
446 return Ext0;
447 if (Cost1 > Cost0)
448 return Ext1;
449
450 // If the costs are equal and there is a preferred extract index, shuffle the
451 // opposite operand.
452 if (PreferredExtractIndex == Index0)
453 return Ext1;
454 if (PreferredExtractIndex == Index1)
455 return Ext0;
456
457 // Otherwise, replace the extract with the higher index.
458 return Index0 > Index1 ? Ext0 : Ext1;
459}
460
461/// Compare the relative costs of 2 extracts followed by scalar operation vs.
462/// vector operation(s) followed by extract. Return true if the existing
463/// instructions are cheaper than a vector alternative. Otherwise, return false
464/// and if one of the extracts should be transformed to a shufflevector, set
465/// \p ConvertToShuffle to that extract instruction.
466bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
467 ExtractElementInst *Ext1,
468 const Instruction &I,
469 ExtractElementInst *&ConvertToShuffle,
470 unsigned PreferredExtractIndex) {
471 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
472 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
473 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
474
475 unsigned Opcode = I.getOpcode();
476 Value *Ext0Src = Ext0->getVectorOperand();
477 Value *Ext1Src = Ext1->getVectorOperand();
478 Type *ScalarTy = Ext0->getType();
479 auto *VecTy = cast<VectorType>(Ext0Src->getType());
480 InstructionCost ScalarOpCost, VectorOpCost;
481
482 // Get cost estimates for scalar and vector versions of the operation.
483 bool IsBinOp = Instruction::isBinaryOp(Opcode);
484 if (IsBinOp) {
485 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
486 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
487 } else {
488 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
489 "Expected a compare");
490 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
491 ScalarOpCost = TTI.getCmpSelInstrCost(
492 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
493 VectorOpCost = TTI.getCmpSelInstrCost(
494 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
495 }
496
497 // Get cost estimates for the extract elements. These costs will factor into
498 // both sequences.
499 unsigned Ext0Index = Ext0IndexC->getZExtValue();
500 unsigned Ext1Index = Ext1IndexC->getZExtValue();
501
502 InstructionCost Extract0Cost =
503 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
504 InstructionCost Extract1Cost =
505 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
506
507 // A more expensive extract will always be replaced by a splat shuffle.
508 // For example, if Ext0 is more expensive:
509 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
510 // extelt (opcode (splat V0, Ext0), V1), Ext1
511 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
512 // check the cost of creating a broadcast shuffle and shuffling both
513 // operands to element 0.
514 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
515 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
516 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
517
518 // Extra uses of the extracts mean that we include those costs in the
519 // vector total because those instructions will not be eliminated.
520 InstructionCost OldCost, NewCost;
521 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
522 // Handle a special case. If the 2 extracts are identical, adjust the
523 // formulas to account for that. The extra use charge allows for either the
524 // CSE'd pattern or an unoptimized form with identical values:
525 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
526 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
527 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
528 OldCost = CheapExtractCost + ScalarOpCost;
529 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
530 } else {
531 // Handle the general case. Each extract is actually a different value:
532 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
533 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
534 NewCost = VectorOpCost + CheapExtractCost +
535 !Ext0->hasOneUse() * Extract0Cost +
536 !Ext1->hasOneUse() * Extract1Cost;
537 }
538
539 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
540 if (ConvertToShuffle) {
541 if (IsBinOp && DisableBinopExtractShuffle)
542 return true;
543
544 // If we are extracting from 2 different indexes, then one operand must be
545 // shuffled before performing the vector operation. The shuffle mask is
546 // poison except for 1 lane that is being translated to the remaining
547 // extraction lane. Therefore, it is a splat shuffle. Ex:
548 // ShufMask = { poison, poison, 0, poison }
549 // TODO: The cost model has an option for a "broadcast" shuffle
550 // (splat-from-element-0), but no option for a more general splat.
551 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
552 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
554 ShuffleMask[BestInsIndex] = BestExtIndex;
556 VecTy, VecTy, ShuffleMask, CostKind, 0,
557 nullptr, {ConvertToShuffle});
558 } else {
560 VecTy, VecTy, {}, CostKind, 0, nullptr,
561 {ConvertToShuffle});
562 }
563 }
564
565 // Aggressively form a vector op if the cost is equal because the transform
566 // may enable further optimization.
567 // Codegen can reverse this transform (scalarize) if it was not profitable.
568 return OldCost < NewCost;
569}
570
571/// Create a shuffle that translates (shifts) 1 element from the input vector
572/// to a new element location.
573static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
574 unsigned NewIndex, IRBuilderBase &Builder) {
575 // The shuffle mask is poison except for 1 lane that is being translated
576 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
577 // ShufMask = { 2, poison, poison, poison }
578 auto *VecTy = cast<FixedVectorType>(Vec->getType());
579 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
580 ShufMask[NewIndex] = OldIndex;
581 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
582}
583
584/// Given an extract element instruction with constant index operand, shuffle
585/// the source vector (shift the scalar element) to a NewIndex for extraction.
586/// Return null if the input can be constant folded, so that we are not creating
587/// unnecessary instructions.
588static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
589 IRBuilderBase &Builder) {
590 // Shufflevectors can only be created for fixed-width vectors.
591 Value *X = ExtElt->getVectorOperand();
592 if (!isa<FixedVectorType>(X->getType()))
593 return nullptr;
594
595 // If the extract can be constant-folded, this code is unsimplified. Defer
596 // to other passes to handle that.
597 Value *C = ExtElt->getIndexOperand();
598 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
599 if (isa<Constant>(X))
600 return nullptr;
601
602 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
603 NewIndex, Builder);
604 return Shuf;
605}
606
607/// Try to reduce extract element costs by converting scalar compares to vector
608/// compares followed by extract.
609/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
610Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
611 Instruction &I) {
612 assert(isa<CmpInst>(&I) && "Expected a compare");
613
614 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
615 // --> extelt (cmp Pred V0, V1), ExtIndex
616 ++NumVecCmp;
617 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
618 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
619 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
620}
621
622/// Try to reduce extract element costs by converting scalar binops to vector
623/// binops followed by extract.
624/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
625Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
626 Instruction &I) {
627 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
628
629 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
630 // --> extelt (bo V0, V1), ExtIndex
631 ++NumVecBO;
632 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
633 V1, "foldExtExtBinop");
634
635 // All IR flags are safe to back-propagate because any potential poison
636 // created in unused vector elements is discarded by the extract.
637 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
638 VecBOInst->copyIRFlags(&I);
639
640 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
641}
642
643/// Match an instruction with extracted vector operands.
644bool VectorCombine::foldExtractExtract(Instruction &I) {
645 // It is not safe to transform things like div, urem, etc. because we may
646 // create undefined behavior when executing those on unknown vector elements.
648 return false;
649
650 Instruction *I0, *I1;
651 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
652 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
654 return false;
655
656 Value *V0, *V1;
657 uint64_t C0, C1;
658 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
659 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
660 V0->getType() != V1->getType())
661 return false;
662
663 // If the scalar value 'I' is going to be re-inserted into a vector, then try
664 // to create an extract to that same element. The extract/insert can be
665 // reduced to a "select shuffle".
666 // TODO: If we add a larger pattern match that starts from an insert, this
667 // probably becomes unnecessary.
668 auto *Ext0 = cast<ExtractElementInst>(I0);
669 auto *Ext1 = cast<ExtractElementInst>(I1);
670 uint64_t InsertIndex = InvalidIndex;
671 if (I.hasOneUse())
672 match(I.user_back(),
673 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
674
675 ExtractElementInst *ExtractToChange;
676 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
677 return false;
678
679 Value *ExtOp0 = Ext0->getVectorOperand();
680 Value *ExtOp1 = Ext1->getVectorOperand();
681
682 if (ExtractToChange) {
683 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
684 Value *NewExtOp =
685 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
686 if (!NewExtOp)
687 return false;
688 if (ExtractToChange == Ext0)
689 ExtOp0 = NewExtOp;
690 else
691 ExtOp1 = NewExtOp;
692 }
693
694 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
695 : Ext0->getIndexOperand();
696 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
697 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
698 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
699 Worklist.push(Ext0);
700 Worklist.push(Ext1);
701 replaceValue(I, *NewExt);
702 return true;
703}
704
705/// Try to replace an extract + scalar fneg + insert with a vector fneg +
706/// shuffle.
707bool VectorCombine::foldInsExtFNeg(Instruction &I) {
708 // Match an insert (op (extract)) pattern.
709 Value *DstVec;
710 uint64_t ExtIdx, InsIdx;
711 Instruction *FNeg;
712 if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
713 m_ConstantInt(InsIdx))))
714 return false;
715
716 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
717 Value *SrcVec;
718 Instruction *Extract;
719 if (!match(FNeg, m_FNeg(m_CombineAnd(
720 m_Instruction(Extract),
721 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
722 return false;
723
724 auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
725 auto *DstVecScalarTy = DstVecTy->getScalarType();
726 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
727 if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
728 return false;
729
730 // Ignore if insert/extract index is out of bounds or destination vector has
731 // one element
732 unsigned NumDstElts = DstVecTy->getNumElements();
733 unsigned NumSrcElts = SrcVecTy->getNumElements();
734 if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
735 return false;
736
737 // We are inserting the negated element into the same lane that we extracted
738 // from. This is equivalent to a select-shuffle that chooses all but the
739 // negated element from the destination vector.
740 SmallVector<int> Mask(NumDstElts);
741 std::iota(Mask.begin(), Mask.end(), 0);
742 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
743 InstructionCost OldCost =
744 TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
745 TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
746
747 // If the extract has one use, it will be eliminated, so count it in the
748 // original cost. If it has more than one use, ignore the cost because it will
749 // be the same before/after.
750 if (Extract->hasOneUse())
751 OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
752
753 InstructionCost NewCost =
754 TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
756 DstVecTy, Mask, CostKind);
757
758 bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
759 // If the lengths of the two vectors are not equal,
760 // we need to add a length-change vector. Add this cost.
761 SmallVector<int> SrcMask;
762 if (NeedLenChg) {
763 SrcMask.assign(NumDstElts, PoisonMaskElem);
764 SrcMask[ExtIdx % NumDstElts] = ExtIdx;
766 DstVecTy, SrcVecTy, SrcMask, CostKind);
767 }
768
769 LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
770 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
771 << "\n");
772 if (NewCost > OldCost)
773 return false;
774
775 Value *NewShuf, *LenChgShuf = nullptr;
776 // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
777 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
778 if (NeedLenChg) {
779 // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
780 LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
781 NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
782 Worklist.pushValue(LenChgShuf);
783 } else {
784 // shuffle DstVec, (fneg SrcVec), Mask
785 NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
786 }
787
788 Worklist.pushValue(VecFNeg);
789 replaceValue(I, *NewShuf);
790 return true;
791}
792
793 /// Try to fold insert(binop(x,y),binop(a,b),idx)
794 /// --> binop(insert(x,a,idx),insert(y,b,idx))
// The insertelement overwrites lane `idx` of the vector binop's result with
// the scalar binop's result; hoisting the two inserts above the binop leaves
// a single vector binop in place of a vector op + scalar op + insert.
795 bool VectorCombine::foldInsExtBinop(Instruction &I) {
796 BinaryOperator *VecBinOp, *SclBinOp;
797 uint64_t Index;
// Both binop operands must be single-use so the originals die after the fold.
798 if (!match(&I,
799 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
800 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
801 return false;
802
803 // TODO: Add support for addlike etc.
// Vector and scalar binops must share one opcode to merge into one vector op.
804 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
805 if (BinOpcode != SclBinOp->getOpcode())
806 return false;
807
808 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
809 if (!ResultTy)
810 return false;
811
812 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
813 // shuffle?
814
// NOTE(review): original lines 815 and 817 (the `InstructionCost OldCost =`
// declaration and, presumably, the matching SclBinOp cost term) were dropped
// by extraction — restore them from upstream before building.
816 TTI.getInstructionCost(VecBinOp, CostKind) +
818 InstructionCost NewCost =
819 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
820 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
821 Index, VecBinOp->getOperand(0),
822 SclBinOp->getOperand(0)) +
823 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
824 Index, VecBinOp->getOperand(1),
825 SclBinOp->getOperand(1));
826
827 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
828 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
829 << "\n");
// Equal cost still folds: the rewrite may enable further combines.
830 if (NewCost > OldCost)
831 return false;
832
833 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
834 SclBinOp->getOperand(0), Index);
835 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
836 SclBinOp->getOperand(1), Index);
837 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
838
839 // Intersect flags from the old binops.
// Only flags valid for BOTH originals may survive on the merged binop.
840 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
841 NewInst->copyIRFlags(VecBinOp);
842 NewInst->andIRFlags(SclBinOp);
843 }
844
845 Worklist.pushValue(NewIns0);
846 Worklist.pushValue(NewIns1);
847 replaceValue(I, *NewBO);
848 return true;
849 }
850
851 /// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
852 /// Supports: bitcast, trunc, sext, zext
// Pushing the bitwise op below a pair of identical casts replaces two casts
// with one, at the price of doing the logic op on the (usually narrower or
// same-size) source type; a TTI cost comparison gates the rewrite.
853 bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
854 // Check if this is a bitwise logic operation
855 auto *BinOp = dyn_cast<BinaryOperator>(&I);
856 if (!BinOp || !BinOp->isBitwiseLogicOp())
857 return false;
858
859 // Get the cast instructions
860 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
861 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
862 if (!LHSCast || !RHSCast) {
863 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
864 return false;
865 }
866
867 // Both casts must be the same type
868 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
869 if (CastOpcode != RHSCast->getOpcode())
870 return false;
871
872 // Only handle supported cast operations
873 switch (CastOpcode) {
874 case Instruction::BitCast:
875 case Instruction::Trunc:
876 case Instruction::SExt:
877 case Instruction::ZExt:
878 break;
879 default:
880 return false;
881 }
882
883 Value *LHSSrc = LHSCast->getOperand(0);
884 Value *RHSSrc = RHSCast->getOperand(0);
885
886 // Source types must match
887 if (LHSSrc->getType() != RHSSrc->getType())
888 return false;
889
890 auto *SrcTy = LHSSrc->getType();
891 auto *DstTy = I.getType();
892 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
893 // Other casts only handle vector types with integer elements.
894 if (CastOpcode != Instruction::BitCast &&
895 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
896 return false;
897
898 // Only integer scalar/vector values are legal for bitwise logic operations.
899 if (!SrcTy->getScalarType()->isIntegerTy() ||
900 !DstTy->getScalarType()->isIntegerTy())
901 return false;
902
903 // Cost Check :
904 // OldCost = bitlogic + 2*casts
905 // NewCost = bitlogic + cast
906
907 // Calculate specific costs for each cast with instruction context
// NOTE(review): original lines 908 and 910 (the `InstructionCost LHSCastCost
// =` / `InstructionCost RHSCastCost =` declarations heading these two
// getCastInstrCost calls) were dropped by extraction — restore from upstream.
909 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
911 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
912
913 InstructionCost OldCost =
914 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
915 LHSCastCost + RHSCastCost;
916
917 // For new cost, we can't provide an instruction (it doesn't exist yet)
918 InstructionCost GenericCastCost = TTI.getCastInstrCost(
919 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
920
921 InstructionCost NewCost =
922 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
923 GenericCastCost;
924
925 // Account for multi-use casts using specific costs
// A multi-use cast survives the transform, so its cost stays in NewCost.
926 if (!LHSCast->hasOneUse())
927 NewCost += LHSCastCost;
928 if (!RHSCast->hasOneUse())
929 NewCost += RHSCastCost;
930
931 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
932 << " NewCost=" << NewCost << "\n");
933
934 if (NewCost > OldCost)
935 return false;
936
937 // Create the operation on the source type
938 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
939 BinOp->getName() + ".inner");
940 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
941 NewBinOp->copyIRFlags(BinOp);
942
943 Worklist.pushValue(NewOp);
944
945 // Create the cast operation directly to ensure we get a new instruction
946 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
947
948 // Preserve cast instruction flags
// Intersect flags: keep only those that held for both original casts.
949 NewCast->copyIRFlags(LHSCast);
950 NewCast->andIRFlags(RHSCast);
951
952 // Insert the new instruction
953 Value *Result = Builder.Insert(NewCast);
954
955 replaceValue(I, *Result);
956 return true;
957 }
958
959 /// Match:
960 // bitop(castop(x), C) ->
961 // bitop(castop(x), castop(InvC)) ->
962 // castop(bitop(x, InvC))
963 // Supports: bitcast
// Constant-operand variant of foldBitOpOfCastops: if a constant InvC exists
// whose cast losslessly reproduces C, the bitwise op can be performed on the
// cast's source type against InvC, leaving a single cast of the result.
964 bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
// NOTE(review): original line 965 (the `Value *LHS;` declaration) and line
// 969 (presumably the commutative bitwise-logic match binding LHS and the
// constant C) were dropped by extraction — restore from upstream.
966 Constant *C;
967
968 // Check if this is a bitwise logic operation
970 return false;
971
972 // Get the cast instructions
973 auto *LHSCast = dyn_cast<CastInst>(LHS);
974 if (!LHSCast)
975 return false;
976
977 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
978
979 // Only handle supported cast operations
980 switch (CastOpcode) {
981 case Instruction::BitCast:
982 case Instruction::ZExt:
983 case Instruction::SExt:
984 case Instruction::Trunc:
985 break;
986 default:
987 return false;
988 }
989
990 Value *LHSSrc = LHSCast->getOperand(0);
991
992 auto *SrcTy = LHSSrc->getType();
993 auto *DstTy = I.getType();
994 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
995 // Other casts only handle vector types with integer elements.
996 if (CastOpcode != Instruction::BitCast &&
997 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
998 return false;
999
1000 // Only integer scalar/vector values are legal for bitwise logic operations.
1001 if (!SrcTy->getScalarType()->isIntegerTy() ||
1002 !DstTy->getScalarType()->isIntegerTy())
1003 return false;
1004
1005 // Find the constant InvC, such that castop(InvC) equals to C.
// RHSFlags records which wrap/nneg flags the inverse cast would preserve.
1006 PreservedCastFlags RHSFlags;
1007 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
1008 if (!InvC)
1009 return false;
1010
1011 // Cost Check :
1012 // OldCost = bitlogic + cast
1013 // NewCost = bitlogic + cast
1014
1015 // Calculate specific costs for each cast with instruction context
1016 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1017 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1018
1019 InstructionCost OldCost =
1020 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1021
1022 // For new cost, we can't provide an instruction (it doesn't exist yet)
1023 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1024 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1025
1026 InstructionCost NewCost =
1027 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1028 GenericCastCost;
1029
1030 // Account for multi-use casts using specific costs
1031 if (!LHSCast->hasOneUse())
1032 NewCost += LHSCastCost;
1033
1034 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1035 << " NewCost=" << NewCost << "\n");
1036
1037 if (NewCost > OldCost)
1038 return false;
1039
1040 // Create the operation on the source type
1041 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1042 LHSSrc, InvC, I.getName() + ".inner");
1043 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1044 NewBinOp->copyIRFlags(&I);
1045
1046 Worklist.pushValue(NewOp);
1047
1048 // Create the cast operation directly to ensure we get a new instruction
1049 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1050
1051 // Preserve cast instruction flags
// Re-apply only the flags proven lossless for the inverse-cast constant,
// then intersect with the original cast's flags.
1052 if (RHSFlags.NNeg)
1053 NewCast->setNonNeg();
1054 if (RHSFlags.NUW)
1055 NewCast->setHasNoUnsignedWrap();
1056 if (RHSFlags.NSW)
1057 NewCast->setHasNoSignedWrap();
1058
1059 NewCast->andIRFlags(LHSCast);
1060
1061 // Insert the new instruction
1062 Value *Result = Builder.Insert(NewCast);
1063
1064 replaceValue(I, *Result);
1065 return true;
1066 }
1067
1068 /// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1069 /// destination type followed by shuffle. This can enable further transforms by
1070 /// moving bitcasts or shuffles together.
1071 bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1072 Value *V0, *V1;
1073 ArrayRef<int> Mask;
1074 if (!match(&I, m_BitCast(m_OneUse(
1075 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1076 return false;
1077
1078 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1079 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1080 // mask for scalable type is a splat or not.
1081 // 2) Disallow non-vector casts.
1082 // TODO: We could allow any shuffle.
1083 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1084 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1085 if (!DestTy || !SrcTy)
1086 return false;
1087
1088 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1089 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
// The source's total bit width must divide evenly into dest-sized elements.
1090 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1091 return false;
1092
1093 bool IsUnary = isa<UndefValue>(V1);
1094
1095 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1096 // if it won't increase the number of bitcasts.
1097 if (!IsUnary) {
// NOTE(review): original lines 1098-1099 (the BCTy0/BCTy1 declarations —
// presumably dyn_casts of the types behind bitcasts of V0/V1) were dropped
// by extraction — restore them from upstream before building.
1100 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1101 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1102 return false;
1103 }
1104
1105 SmallVector<int, 16> NewMask;
1106 if (DestEltSize <= SrcEltSize) {
1107 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1108 // always be expanded to the equivalent form choosing narrower elements.
1109 if (SrcEltSize % DestEltSize != 0)
1110 return false;
1111 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1112 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1113 } else {
1114 // The bitcast is from narrow elements to wide elements. The shuffle mask
1115 // must choose consecutive elements to allow casting first.
1116 if (DestEltSize % SrcEltSize != 0)
1117 return false;
1118 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1119 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1120 return false;
1121 }
1122
1123 // Bitcast the shuffle src - keep its original width but using the destination
1124 // scalar type.
1125 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1126 auto *NewShuffleTy =
1127 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1128 auto *OldShuffleTy =
1129 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1130 unsigned NumOps = IsUnary ? 1 : 2;
1131
1132 // The new shuffle must not cost more than the old shuffle.
// NOTE(review): original lines 1133-1135 (the ShuffleKind `SK` selection —
// presumably single-source vs two-source based on IsUnary) were dropped by
// extraction — restore them from upstream before building.
1136
1137 InstructionCost NewCost =
1138 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1139 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1140 TargetTransformInfo::CastContextHint::None,
1141 CostKind));
1142 InstructionCost OldCost =
1143 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1144 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1145 TargetTransformInfo::CastContextHint::None,
1146 CostKind);
1147
1148 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1149 << OldCost << " vs NewCost: " << NewCost << "\n");
1150
1151 if (NewCost > OldCost || !NewCost.isValid())
1152 return false;
1153
1154 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1155 ++NumShufOfBitcast;
// peekThroughBitcasts lets chained bitcasts collapse into the one we create.
1156 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1157 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1158 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1159 replaceValue(I, *Shuf);
1160 return true;
1161 }
1162
1163 /// VP Intrinsics whose vector operands are both splat values may be simplified
1164 /// into the scalar version of the operation and the result splatted. This
1165 /// can lead to scalarization down the line.
1166 bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1167 if (!isa<VPIntrinsic>(I))
1168 return false;
1169 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1170 Value *Op0 = VPI.getArgOperand(0);
1171 Value *Op1 = VPI.getArgOperand(1);
1172
1173 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1174 return false;
1175
1176 // Check getSplatValue early in this function, to avoid doing unnecessary
1177 // work.
1178 Value *ScalarOp0 = getSplatValue(Op0);
1179 Value *ScalarOp1 = getSplatValue(Op1);
1180 if (!ScalarOp0 || !ScalarOp1)
1181 return false;
1182
1183 // For the binary VP intrinsics supported here, the result on disabled lanes
1184 // is a poison value. For now, only do this simplification if all lanes
1185 // are active.
1186 // TODO: Relax the condition that all lanes are active by using insertelement
1187 // on inactive lanes.
1188 auto IsAllTrueMask = [](Value *MaskVal) {
1189 if (Value *SplattedVal = getSplatValue(MaskVal))
1190 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1191 return ConstValue->isAllOnesValue();
1192 return false;
1193 };
1194 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1195 return false;
1196
1197 // Check to make sure we support scalarization of the intrinsic
1198 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1199 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1200 return false;
1201
1202 // Calculate cost of splatting both operands into vectors and the vector
1203 // intrinsic
1204 VectorType *VecTy = cast<VectorType>(VPI.getType());
1205 SmallVector<int> Mask;
1206 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1207 Mask.resize(FVTy->getNumElements(), 0);
1208 InstructionCost SplatCost =
1209 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
// NOTE(review): original line 1210 (the broadcast-shuffle cost term of the
// splat, whose argument list ends on the next line) was dropped by
// extraction — restore it from upstream before building.
1211 CostKind);
1212
1213 // Calculate the cost of the VP Intrinsic
// NOTE(review): original line 1214 (the declaration of `Args`, the argument
// type list filled by the loop below) was dropped by extraction.
1215 for (Value *V : VPI.args())
1216 Args.push_back(V->getType());
1217 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1218 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1219 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1220
1221 // Determine scalar opcode
1222 std::optional<unsigned> FunctionalOpcode =
1223 VPI.getFunctionalOpcode();
1224 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1225 if (!FunctionalOpcode) {
// No plain IR opcode equivalent — fall back to a scalar intrinsic, if any.
1226 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1227 if (!ScalarIntrID)
1228 return false;
1229 }
1230
1231 // Calculate cost of scalarizing
1232 InstructionCost ScalarOpCost = 0;
1233 if (ScalarIntrID) {
1234 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1235 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1236 } else {
1237 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1238 VecTy->getScalarType(), CostKind);
1239 }
1240
1241 // The existing splats may be kept around if other instructions use them.
1242 InstructionCost CostToKeepSplats =
1243 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1244 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1245
1246 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1247 << "\n");
1248 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1249 << ", Cost of scalarizing:" << NewCost << "\n");
1250
1251 // We want to scalarize unless the vector variant actually has lower cost.
1252 if (OldCost < NewCost || !NewCost.isValid())
1253 return false;
1254
1255 // Scalarize the intrinsic
1256 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1257 Value *EVL = VPI.getArgOperand(3);
1258
1259 // If the VP op might introduce UB or poison, we can scalarize it provided
1260 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1261 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1262 // scalarizing it.
1263 bool SafeToSpeculate;
1264 if (ScalarIntrID)
1265 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1266 .hasAttribute(Attribute::AttrKind::Speculatable);
1267 else
// NOTE(review): original line 1268 (the assignment of SafeToSpeculate for
// the plain-opcode case, whose argument list continues on the next line)
// was dropped by extraction — restore it from upstream before building.
1269 *FunctionalOpcode, &VPI, nullptr, SQ.AC, SQ.DT);
1270 if (!SafeToSpeculate &&
1271 !isKnownNonZero(EVL, SimplifyQuery(*DL, SQ.DT, SQ.AC, &VPI)))
1272 return false;
1273
1274 Value *ScalarVal =
1275 ScalarIntrID
1276 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1277 {ScalarOp0, ScalarOp1})
1278 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1279 ScalarOp0, ScalarOp1);
1280
1281 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1282 return true;
1283 }
1284
1285 /// Match a vector op/compare/intrinsic with at least one
1286 /// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1287 /// by insertelement.
1288 bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1289 auto *UO = dyn_cast<UnaryOperator>(&I);
1290 auto *BO = dyn_cast<BinaryOperator>(&I);
1291 auto *CI = dyn_cast<CmpInst>(&I);
1292 auto *II = dyn_cast<IntrinsicInst>(&I);
1293 if (!UO && !BO && !CI && !II)
1294 return false;
1295
1296 // TODO: Allow intrinsics with different argument types
1297 if (II) {
1298 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1299 return false;
1300 for (auto [Idx, Arg] : enumerate(II->args()))
1301 if (Arg->getType() != II->getType() &&
1302 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1303 return false;
1304 }
1305
1306 // Do not convert the vector condition of a vector select into a scalar
1307 // condition. That may cause problems for codegen because of differences in
1308 // boolean formats and register-file transfers.
1309 // TODO: Can we account for that in the cost model?
1310 if (CI)
1311 for (User *U : I.users())
1312 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1313 return false;
1314
1315 // Match constant vectors or scalars being inserted into constant vectors:
1316 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
// ScalarOps holds the inserted scalar per operand, or nullptr when the whole
// operand is a constant vector (scalarized later by constant extraction).
1317 SmallVector<Value *> VecCs, ScalarOps;
1318 std::optional<uint64_t> Index;
1319
1320 auto Ops = II ? II->args() : I.operands();
1321 for (auto [OpNum, Op] : enumerate(Ops)) {
1322 Constant *VecC;
1323 Value *V;
1324 uint64_t InsIdx = 0;
1325 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1326 m_ConstantInt(InsIdx)))) {
1327 // Bail if any inserts are out of bounds.
1328 VectorType *OpTy = cast<VectorType>(Op->getType());
1329 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1330 return false;
1331 // All inserts must have the same index.
1332 // TODO: Deal with mismatched index constants and variable indexes?
1333 if (!Index)
1334 Index = InsIdx;
1335 else if (InsIdx != *Index)
1336 return false;
1337 VecCs.push_back(VecC);
1338 ScalarOps.push_back(V);
1339 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1340 OpNum, &TTI)) {
// Scalar-at-arg intrinsic operands pass through unchanged.
1341 VecCs.push_back(Op.get());
1342 ScalarOps.push_back(Op.get());
1343 } else if (match(Op.get(), m_Constant(VecC))) {
1344 VecCs.push_back(VecC);
1345 ScalarOps.push_back(nullptr);
1346 } else {
1347 return false;
1348 }
1349 }
1350
1351 // Bail if all operands are constant.
1352 if (!Index.has_value())
1353 return false;
1354
1355 VectorType *VecTy = cast<VectorType>(I.getType());
1356 Type *ScalarTy = VecTy->getScalarType();
1357 assert(VecTy->isVectorTy() &&
1358 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1359 ScalarTy->isPointerTy()) &&
1360 "Unexpected types for insert element into binop or cmp");
1361
1362 unsigned Opcode = I.getOpcode();
1363 InstructionCost ScalarOpCost, VectorOpCost;
1364 if (CI) {
1365 CmpInst::Predicate Pred = CI->getPredicate();
1366 ScalarOpCost = TTI.getCmpSelInstrCost(
1367 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1368 VectorOpCost = TTI.getCmpSelInstrCost(
1369 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1370 } else if (UO || BO) {
1371 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1372 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1373 } else {
1374 IntrinsicCostAttributes ScalarICA(
1375 II->getIntrinsicID(), ScalarTy,
1376 SmallVector<Type *>(II->arg_size(), ScalarTy));
1377 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1378 IntrinsicCostAttributes VectorICA(
1379 II->getIntrinsicID(), VecTy,
1380 SmallVector<Type *>(II->arg_size(), VecTy));
1381 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1382 }
1383
1384 // Fold the vector constants in the original vectors into a new base vector to
1385 // get more accurate cost modelling.
1386 Value *NewVecC = nullptr;
1387 if (CI)
1388 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1389 else if (UO)
1390 NewVecC =
1391 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1392 else if (BO)
1393 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1394 else if (II)
1395 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1396
1397 if (!NewVecC)
1398 return false;
1399
1400 // Get cost estimate for the insert element. This cost will factor into
1401 // both sequences.
1402 InstructionCost OldCost = VectorOpCost;
1403 InstructionCost NewCost =
1404 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1405 CostKind, *Index, NewVecC);
1406
1407 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1408 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1409 II->getIntrinsicID(), Idx, &TTI)))
1410 continue;
// NOTE(review): original line 1411 (the `InstructionCost InsertCost =`
// declaration heading this getVectorInstrCost call) was dropped by
// extraction — restore it from upstream before building.
1412 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1413 OldCost += InsertCost;
// A multi-use insert survives the transform, so charge it to NewCost too.
1414 NewCost += !Op->hasOneUse() * InsertCost;
1415 }
1416
1417 // We want to scalarize unless the vector variant actually has lower cost.
1418 if (OldCost < NewCost || !NewCost.isValid())
1419 return false;
1420
1421 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1422 // inselt NewVecC, (scalar_op V0, V1), Index
1423 if (CI)
1424 ++NumScalarCmp;
1425 else if (UO || BO)
1426 ++NumScalarOps;
1427 else
1428 ++NumScalarIntrinsic;
1429
1430 // For constant cases, extract the scalar element, this should constant fold.
1431 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1432 if (!Scalar)
// NOTE(review): original line 1433 (the assignment into ScalarOps[OpIdx] —
// presumably a constant extractelement whose argument list ends on the next
// line) was dropped by extraction — restore it from upstream.
1434 cast<Constant>(VecC), Builder.getInt64(*Index));
1435
1436 Value *Scalar;
1437 if (CI)
1438 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1439 else if (UO || BO)
1440 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1441 else
1442 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1443
1444 Scalar->setName(I.getName() + ".scalar");
1445
1446 // All IR flags are safe to back-propagate. There is no potential for extra
1447 // poison to be created by the scalar instruction.
1448 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1449 ScalarInst->copyIRFlags(&I);
1450
1451 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1452 replaceValue(I, *Insert);
1453 return true;
1454 }
1455
1456 /// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1457 /// a vector into vector operations followed by extract. Note: The SLP pass
1458 /// may miss this pattern because of implementation problems.
1459 bool VectorCombine::foldExtractedCmps(Instruction &I) {
1460 auto *BI = dyn_cast<BinaryOperator>(&I);
1461
1462 // We are looking for a scalar binop of booleans.
1463 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1464 if (!BI || !I.getType()->isIntegerTy(1))
1465 return false;
1466
1467 // The compare predicates should match, and each compare should have a
1468 // constant operand.
1469 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1470 Instruction *I0, *I1;
1471 Constant *C0, *C1;
1472 CmpPredicate P0, P1;
1473 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1474 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1475 return false;
1476
1477 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1478 if (!MatchingPred)
1479 return false;
1480
1481 // The compare operands must be extracts of the same vector with constant
1482 // extract indexes.
1483 Value *X;
1484 uint64_t Index0, Index1;
1485 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1486 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1487 return false;
1488
1489 auto *Ext0 = cast<ExtractElementInst>(I0);
1490 auto *Ext1 = cast<ExtractElementInst>(I1);
// getShuffleExtract picks which extract is cheaper to replace by a shuffle.
1491 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1492 if (!ConvertToShuf)
1493 return false;
1494 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1495 "Unknown ExtractElementInst");
1496
1497 // The original scalar pattern is:
1498 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1499 CmpInst::Predicate Pred = *MatchingPred;
1500 unsigned CmpOpcode =
1501 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1502 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1503 if (!VecTy)
1504 return false;
1505
1506 InstructionCost Ext0Cost =
1507 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1508 InstructionCost Ext1Cost =
1509 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
// NOTE(review): original line 1510 (the `InstructionCost CmpCost =`
// declaration heading this getCmpSelInstrCost call) was dropped by
// extraction — restore it from upstream before building.
1511 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1512 CostKind);
1513
1514 InstructionCost OldCost =
1515 Ext0Cost + Ext1Cost + CmpCost * 2 +
1516 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1517
1518 // The proposed vector pattern is:
1519 // vcmp = cmp Pred X, VecC
1520 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1521 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1522 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
// NOTE(review): original lines 1523-1524 (presumably the `CmpTy` result-type
// definition and the `InstructionCost NewCost =` declaration heading this
// vector compare cost) were dropped by extraction — restore from upstream.
1525 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1526 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1527 ShufMask[CheapIndex] = ExpensiveIndex;
// NOTE(review): original line 1528 (the `NewCost += TTI.getShuffleCost(`
// opener whose argument list ends on the next line) was dropped by
// extraction — restore it from upstream before building.
1529 CmpTy, ShufMask, CostKind);
1530 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1531 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1532 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1533 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1534
1535 // Aggressively form vector ops if the cost is equal because the transform
1536 // may enable further optimization.
1537 // Codegen can reverse this transform (scalarize) if it was not profitable.
1538 if (OldCost < NewCost || !NewCost.isValid())
1539 return false;
1540
1541 // Create a vector constant from the 2 scalar constants.
1542 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1543 PoisonValue::get(VecTy->getElementType()));
1544 CmpC[Index0] = C0;
1545 CmpC[Index1] = C1;
1546 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
// Shift the expensive lane onto the cheap lane, then combine the two lanes.
1547 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1548 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1549 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1550 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1551 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1552 replaceValue(I, *NewExt);
1553 ++NumVecCmpBO;
1554 return true;
1555 }
1556
1557 /// Try to fold scalar selects that select between extracted elements and zero
1558 /// into extracting from a vector select. This is rooted at the bitcast.
1559 ///
1560 /// This pattern arises when a vector is bitcast to a smaller element type,
1561 /// elements are extracted, and then conditionally selected with zero:
1562 ///
1563 /// %bc = bitcast <4 x i32> %src to <16 x i8>
1564 /// %e0 = extractelement <16 x i8> %bc, i32 0
1565 /// %s0 = select i1 %cond, i8 %e0, i8 0
1566 /// %e1 = extractelement <16 x i8> %bc, i32 1
1567 /// %s1 = select i1 %cond, i8 %e1, i8 0
1568 /// ...
1569 ///
1570 /// Transforms to:
1571 /// %sel = select i1 %cond, <4 x i32> %src, <4 x i32> zeroinitializer
1572 /// %bc = bitcast <4 x i32> %sel to <16 x i8>
1573 /// %e0 = extractelement <16 x i8> %bc, i32 0
1574 /// %e1 = extractelement <16 x i8> %bc, i32 1
1575 /// ...
1576 ///
1577 /// This is profitable because vector select on wider types produces fewer
1578 /// select/cndmask instructions than scalar selects on each element.
1579 bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
1580 auto *BC = dyn_cast<BitCastInst>(&I);
1581 if (!BC)
1582 return false;
1583
1584 FixedVectorType *SrcVecTy = dyn_cast<FixedVectorType>(BC->getSrcTy());
1585 FixedVectorType *DstVecTy = dyn_cast<FixedVectorType>(BC->getDestTy());
1586 if (!SrcVecTy || !DstVecTy)
1587 return false;
1588
1589 // Source must be 32-bit or 64-bit elements, destination must be smaller
1590 // integer elements. Zero in all these types is all-bits-zero.
1591 Type *SrcEltTy = SrcVecTy->getElementType();
1592 Type *DstEltTy = DstVecTy->getElementType();
1593 unsigned SrcEltBits = SrcEltTy->getPrimitiveSizeInBits();
1594 unsigned DstEltBits = DstEltTy->getPrimitiveSizeInBits();
1595
1596 if (SrcEltBits != 32 && SrcEltBits != 64)
1597 return false;
1598
1599 if (!DstEltTy->isIntegerTy() || DstEltBits >= SrcEltBits)
1600 return false;
1601
1602 // Check profitability using TTI before collecting users.
1603 Type *CondTy = CmpInst::makeCmpResultType(DstEltTy);
1604 Type *VecCondTy = CmpInst::makeCmpResultType(SrcVecTy);
1605
1606 InstructionCost ScalarSelCost =
1607 TTI.getCmpSelInstrCost(Instruction::Select, DstEltTy, CondTy,
// NOTE(review): original lines 1608 and 1611 (the trailing argument lists
// of these two getCmpSelInstrCost calls — presumably the predicate and
// CostKind) were dropped by extraction — restore them from upstream.
1609 InstructionCost VecSelCost =
1610 TTI.getCmpSelInstrCost(Instruction::Select, SrcVecTy, VecCondTy,
1612
1613 // We need at least this many selects for vectorization to be profitable.
1614 // VecSelCost < ScalarSelCost * NumSelects => NumSelects > VecSelCost /
1615 // ScalarSelCost
// Guard against the division below being invalid or dividing by zero.
1616 if (!ScalarSelCost.isValid() || ScalarSelCost == 0)
1617 return false;
1618
1619 unsigned MinSelects = (VecSelCost.getValue() / ScalarSelCost.getValue()) + 1;
1620
1621 // Quick check: if bitcast doesn't have enough users, bail early.
1622 if (!BC->hasNUsesOrMore(MinSelects))
1623 return false;
1624
1625 // Collect all select users that match the pattern, grouped by condition.
1626 // Pattern: select i1 %cond, (extractelement %bc, idx), 0
1627 DenseMap<Value *, SmallVector<SelectInst *, 8>> CondToSelects;
1628
1629 for (User *U : BC->users()) {
1630 auto *Ext = dyn_cast<ExtractElementInst>(U);
1631 if (!Ext)
1632 continue;
1633
1634 for (User *ExtUser : Ext->users()) {
1635 Value *Cond;
1636 // Match: select i1 %cond, %ext, 0
1637 if (match(ExtUser, m_Select(m_Value(Cond), m_Specific(Ext), m_Zero())) &&
1638 Cond->getType()->isIntegerTy(1))
1639 CondToSelects[Cond].push_back(cast<SelectInst>(ExtUser));
1640 }
1641 }
1642
1643 if (CondToSelects.empty())
1644 return false;
1645
1646 bool MadeChange = false;
1647 Value *SrcVec = BC->getOperand(0);
1648
1649 // Process each group of selects with the same condition.
1650 for (auto [Cond, Selects] : CondToSelects) {
1651 // Only profitable if vector select cost < total scalar select cost.
1652 if (Selects.size() < MinSelects) {
1653 LLVM_DEBUG(dbgs() << "VectorCombine: foldSelectsFromBitcast not "
1654 << "profitable (VecCost=" << VecSelCost
1655 << ", ScalarCost=" << ScalarSelCost
1656 << ", NumSelects=" << Selects.size() << ")\n");
1657 continue;
1658 }
1659
1660 // Create the vector select and bitcast once for this condition.
1661 auto InsertPt = std::next(BC->getIterator());
1662
// If the condition is defined after the bitcast, insert after the condition
// instead so all operands dominate the new select.
1663 if (auto *CondInst = dyn_cast<Instruction>(Cond))
1664 if (DT.dominates(BC, CondInst))
1665 InsertPt = std::next(CondInst->getIterator());
1666
1667 Builder.SetInsertPoint(InsertPt);
1668 Value *VecSel =
1669 Builder.CreateSelect(Cond, SrcVec, Constant::getNullValue(SrcVecTy));
1670 Value *NewBC = Builder.CreateBitCast(VecSel, DstVecTy);
1671
1672 // Replace each scalar select with an extract from the new bitcast.
1673 for (SelectInst *Sel : Selects) {
1674 auto *Ext = cast<ExtractElementInst>(Sel->getTrueValue());
1675 Value *Idx = Ext->getIndexOperand();
1676
1677 Builder.SetInsertPoint(Sel);
1678 Value *NewExt = Builder.CreateExtractElement(NewBC, Idx);
1679 replaceValue(*Sel, *NewExt);
1680 MadeChange = true;
1681 }
1682
1683 LLVM_DEBUG(dbgs() << "VectorCombine: folded " << Selects.size()
1684 << " selects into vector select\n");
1685 }
1686
1687 return MadeChange;
1688 }
1689
// Compute, for the vector reduction intrinsic II, the cost of the code that
// feeds the reduction (CostBeforeReduction) and of the reduction itself
// (CostAfterReduction). zext/sext and ext(mul(ext,ext)) inputs are costed via
// the target's extended- / multiply-accumulate-reduction hooks.
// NOTE(review): the first signature line is elided from this view; the
// remaining parameters continue below.
1692                                      const TargetTransformInfo &TTI,
1693                                      InstructionCost &CostBeforeReduction,
1694                                      InstructionCost &CostAfterReduction) {
1695  Instruction *Op0, *Op1;
1696  auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1697  auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1698  unsigned ReductionOpc =
1699      getArithmeticReductionInstruction(II.getIntrinsicID());
  // reduce(zext/sext(X)): cost the cast separately and use the target's
  // extended-reduction cost for the reduction itself.
1700  if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1701    bool IsUnsigned = isa<ZExtInst>(RedOp);
1702    auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1703
1704    CostBeforeReduction =
1705        TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1707    CostAfterReduction =
1708        TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1709                                     ExtType, FastMathFlags(), CostKind);
1710    return;
1711  }
  // reduce.add(ext(mul(ext(A), ext(B)))) with matching extend kinds and
  // source types: the dot-product shape a target can fold into a
  // multiply-accumulate reduction.
1712  if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1713      match(RedOp,
1715      match(Op0, m_ZExtOrSExt(m_Value())) &&
1716      Op0->getOpcode() == Op1->getOpcode() &&
1717      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1718      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1719    // Matched reduce.add(ext(mul(ext(A), ext(B)))
1720    bool IsUnsigned = isa<ZExtInst>(Op0);
1721    auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1722    VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1723
1724    InstructionCost ExtCost =
1725        TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1727    InstructionCost MulCost =
1728        TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1729    InstructionCost Ext2Cost =
1730        TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1733    CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1734    CostAfterReduction = TTI.getMulAccReductionCost(
1735        IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1736    return;
1737  }
  // Fallback: a plain arithmetic reduction of the operand vector; the
  // operand-side cost is left at its incoming value.
1738  CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1739                                                      std::nullopt, CostKind);
1740}
1741
1742bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1743 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1744 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1745 if (BinOpOpc == Instruction::Sub)
1746 ReductionIID = Intrinsic::vector_reduce_add;
1747 if (ReductionIID == Intrinsic::not_intrinsic)
1748 return false;
1749
1750 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1751 Intrinsic::ID IID) -> Value * {
1752 auto *II = dyn_cast<IntrinsicInst>(V);
1753 if (!II)
1754 return nullptr;
1755 if (II->getIntrinsicID() == IID && II->hasOneUse())
1756 return II->getArgOperand(0);
1757 return nullptr;
1758 };
1759
1760 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1761 if (!V0)
1762 return false;
1763 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1764 if (!V1)
1765 return false;
1766
1767 auto *VTy = cast<VectorType>(V0->getType());
1768 if (V1->getType() != VTy)
1769 return false;
1770 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1771 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1772 unsigned ReductionOpc =
1773 getArithmeticReductionInstruction(II0.getIntrinsicID());
1774
1775 InstructionCost OldCost = 0;
1776 InstructionCost NewCost = 0;
1777 InstructionCost CostOfRedOperand0 = 0;
1778 InstructionCost CostOfRed0 = 0;
1779 InstructionCost CostOfRedOperand1 = 0;
1780 InstructionCost CostOfRed1 = 0;
1781 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1782 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1783 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1784 NewCost =
1785 CostOfRedOperand0 + CostOfRedOperand1 +
1786 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1787 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1788 if (NewCost >= OldCost || !NewCost.isValid())
1789 return false;
1790
1791 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1792 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1793 << "\n");
1794 Value *VectorBO;
1795 if (BinOpOpc == Instruction::Or)
1796 VectorBO = Builder.CreateOr(V0, V1, "",
1797 cast<PossiblyDisjointInst>(I).isDisjoint());
1798 else
1799 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1800
1801 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1802 replaceValue(I, *Rdx);
1803 return true;
1804}
1805
1806// Check if memory loc modified between two instrs in the same BB.
// Returns true if any instruction in the scanned range may modify Loc, or
// conservatively once more than MaxInstrsToScan instructions were examined.
// NOTE(review): the signature lines are elided from this view; the iterator
// range parameters [Begin, End) precede Loc below.
1809                                 const MemoryLocation &Loc, AAResults &AA) {
1810  unsigned NumScanned = 0;
1811  return std::any_of(Begin, End, [&](const Instruction &Instr) {
    // The second disjunct bails out conservatively after the scan limit.
1812    return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1813           ++NumScanned > MaxInstrsToScan;
1814  });
1815}
1816
1817namespace {
1818/// Helper class to indicate whether a vector index can be safely scalarized and
1819/// if a freeze needs to be inserted.
1820class ScalarizationResult {
1821 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1822
1823 StatusTy Status;
1824 Value *ToFreeze;
1825
1826 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1827 : Status(Status), ToFreeze(ToFreeze) {}
1828
1829public:
1830 ScalarizationResult(const ScalarizationResult &Other) = default;
1831 ~ScalarizationResult() {
1832 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1833 }
1834
1835 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1836 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1837 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1838 return {StatusTy::SafeWithFreeze, ToFreeze};
1839 }
1840
1841 /// Returns true if the index can be scalarize without requiring a freeze.
1842 bool isSafe() const { return Status == StatusTy::Safe; }
1843 /// Returns true if the index cannot be scalarized.
1844 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1845 /// Returns true if the index can be scalarize, but requires inserting a
1846 /// freeze.
1847 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1848
1849 /// Reset the state of Unsafe and clear ToFreze if set.
1850 void discard() {
1851 ToFreeze = nullptr;
1852 Status = StatusTy::Unsafe;
1853 }
1854
1855 /// Freeze the ToFreeze and update the use in \p User to use it.
1856 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1857 assert(isSafeWithFreeze() &&
1858 "should only be used when freezing is required");
1859 assert(is_contained(ToFreeze->users(), &UserI) &&
1860 "UserI must be a user of ToFreeze");
1861 IRBuilder<>::InsertPointGuard Guard(Builder);
1862 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1863 Value *Frozen =
1864 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1865 for (Use &U : make_early_inc_range((UserI.operands())))
1866 if (U.get() == ToFreeze)
1867 U.set(Frozen);
1868
1869 ToFreeze = nullptr;
1870 }
1871};
1872} // namespace
1873
1874/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1875/// Idx. \p Idx must access a valid vector element.
1876static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1877 const SimplifyQuery &SQ) {
1878 // We do checks for both fixed vector types and scalable vector types.
1879 // This is the number of elements of fixed vector types,
1880 // or the minimum number of elements of scalable vector types.
1881 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1882 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1883
1884 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1885 if (C->getValue().ult(NumElements))
1886 return ScalarizationResult::safe();
1887 return ScalarizationResult::unsafe();
1888 }
1889
1890 // Always unsafe if the index type can't handle all inbound values.
1891 if (!llvm::isUIntN(IntWidth, NumElements))
1892 return ScalarizationResult::unsafe();
1893
1894 APInt Zero(IntWidth, 0);
1895 APInt MaxElts(IntWidth, NumElements);
1896 ConstantRange ValidIndices(Zero, MaxElts);
1897 ConstantRange IdxRange(IntWidth, true);
1898
1899 if (isGuaranteedNotToBePoison(Idx, SQ.AC, SQ.CxtI, SQ.DT)) {
1900 if (ValidIndices.contains(
1901 computeConstantRange(Idx, /*ForSigned=*/false, SQ)))
1902 return ScalarizationResult::safe();
1903 return ScalarizationResult::unsafe();
1904 }
1905
1906 // If the index may be poison, check if we can insert a freeze before the
1907 // range of the index is restricted.
1908 Value *IdxBase;
1909 ConstantInt *CI;
1910 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1911 IdxRange = IdxRange.binaryAnd(CI->getValue());
1912 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1913 IdxRange = IdxRange.urem(CI->getValue());
1914 }
1915
1916 if (ValidIndices.contains(IdxRange))
1917 return ScalarizationResult::safeWithFreeze(IdxBase);
1918 return ScalarizationResult::unsafe();
1919}
1920
1921/// The memory operation on a vector of \p ScalarType had alignment of
1922/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1923/// alignment that will be valid for the memory operation on a single scalar
1924/// element of the same type with index \p Idx.
// NOTE(review): the first signature line is elided from this view; the
// remaining parameters continue below.
1926                                             Type *ScalarType, Value *Idx,
1927                                             const DataLayout &DL) {
  // A constant index gives the exact byte offset of the accessed element;
  // otherwise fall back to the alignment valid for any element offset.
1928  if (auto *C = dyn_cast<ConstantInt>(Idx))
1929    return commonAlignment(VectorAlignment,
1930                           C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1931  return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1932}
1933
1934// Combine patterns like:
1935//  %0 = load <4 x i32>, <4 x i32>* %a
1936//  %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1937//  store <4 x i32> %1, <4 x i32>* %a
1938// to:
1939//  %0 = bitcast <4 x i32>* %a to i32*
1940//  %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1941//  store i32 %b, i32* %1
1942bool VectorCombine::foldSingleElementStore(Instruction &I) {
1944    return false;
1945  auto *SI = cast<StoreInst>(&I);
  // Only simple (non-atomic, non-volatile) stores of vector values qualify.
1946  if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1947    return false;
1948
1949  // TODO: Combine more complicated patterns (multiple insert) by referencing
1950  // TargetTransformInfo.
1952  Value *NewElement;
1953  Value *Idx;
1954  if (!match(SI->getValueOperand(),
1955             m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1956                         m_Value(Idx))))
1957    return false;
1958
  // The inserted-into vector must be a load of the very same address, in the
  // same block as the store.
1959  if (auto *Load = dyn_cast<LoadInst>(Source)) {
1960    auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1961    Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1962    // Don't optimize for atomic/volatile load or store. Ensure memory is not
1963    // modified between, vector type matches store size, and index is inbounds.
1964    if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1965        !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1966        SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1967      return false;
1968
1969    auto ScalarizableIdx =
1970        canScalarizeAccess(VecTy, Idx, SQ.getWithInstruction(Load));
1971    if (ScalarizableIdx.isUnsafe() ||
1972        isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1973                             MemoryLocation::get(SI), AA))
1974      return false;
1975
1976    // Ensure we add the load back to the worklist BEFORE its users so they can
1977    // erased in the correct order.
1978    Worklist.push(Load);
1979
1980    if (ScalarizableIdx.isSafeWithFreeze())
1981      ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
    // Store only the inserted element, addressing its lane via a GEP.
1982    Value *GEP = Builder.CreateInBoundsGEP(
1983        SI->getValueOperand()->getType(), SI->getPointerOperand(),
1984        {ConstantInt::get(Idx->getType(), 0), Idx});
1985    StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1986    NSI->copyMetadata(*SI);
    // The narrow store may be able to claim a larger provable alignment.
1987    Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1988        std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1989        *DL);
1990    NSI->setAlignment(ScalarOpAlignment);
1991    replaceValue(I, *NSI);
1993    return true;
1994  }
1995
1996  return false;
1997}
1998
1999/// Try to scalarize vector loads feeding extractelement or bitcast
2000/// instructions.
2001bool VectorCombine::scalarizeLoad(Instruction &I) {
2002 Value *Ptr;
2003 if (!match(&I, m_Load(m_Value(Ptr))))
2004 return false;
2005
2006 auto *LI = cast<LoadInst>(&I);
2007 auto *VecTy = cast<VectorType>(LI->getType());
2008 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
2009 return false;
2010
2011 bool AllExtracts = true;
2012 bool AllBitcasts = true;
2013 Instruction *LastCheckedInst = LI;
2014 unsigned NumInstChecked = 0;
2015
2016 // Check what type of users we have (must either all be extracts or
2017 // bitcasts) and ensure no memory modifications between the load and
2018 // its users.
2019 for (User *U : LI->users()) {
2020 auto *UI = dyn_cast<Instruction>(U);
2021 if (!UI || UI->getParent() != LI->getParent())
2022 return false;
2023
2024 // If any user is waiting to be erased, then bail out as this will
2025 // distort the cost calculation and possibly lead to infinite loops.
2026 if (UI->use_empty())
2027 return false;
2028
2029 if (!isa<ExtractElementInst>(UI))
2030 AllExtracts = false;
2031 if (!isa<BitCastInst>(UI))
2032 AllBitcasts = false;
2033
2034 // Check if any instruction between the load and the user may modify memory.
2035 if (LastCheckedInst->comesBefore(UI)) {
2036 for (Instruction &I :
2037 make_range(std::next(LI->getIterator()), UI->getIterator())) {
2038 // Bail out if we reached the check limit or the instruction may write
2039 // to memory.
2040 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
2041 return false;
2042 NumInstChecked++;
2043 }
2044 LastCheckedInst = UI;
2045 }
2046 }
2047
2048 if (AllExtracts)
2049 return scalarizeLoadExtract(LI, VecTy, Ptr);
2050 if (AllBitcasts)
2051 return scalarizeLoadBitcast(LI, VecTy, Ptr);
2052 return false;
2053}
2054
2055/// Try to scalarize vector loads feeding extractelement instructions.
2056bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
2057                                         Value *Ptr) {
2059    return false;
2060
2061  DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
2062  llvm::scope_exit FailureGuard([&]() {
2063    // If the transform is aborted, discard the ScalarizationResults.
2064    for (auto &Pair : NeedFreeze)
2065      Pair.second.discard();
2066  });
2067
2068  InstructionCost OriginalCost =
2069      TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2071  InstructionCost ScalarizedCost = 0;
2072
  // First pass: prove every extract index is in range (possibly after a
  // freeze) and accumulate the old and new costs.
2073  for (User *U : LI->users()) {
2074    auto *UI = cast<ExtractElementInst>(U);
2075
2076    auto ScalarIdx = canScalarizeAccess(VecTy, UI->getIndexOperand(),
2077                                        SQ.getWithInstruction(LI));
2078    if (ScalarIdx.isUnsafe())
2079      return false;
2080    if (ScalarIdx.isSafeWithFreeze()) {
      // Remember which extracts need a freeze; the map copy keeps the
      // to-freeze value, so discard the local to satisfy its destructor.
2081      NeedFreeze.try_emplace(UI, ScalarIdx);
2082      ScalarIdx.discard();
2083    }
2084
2085    auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
2086    OriginalCost +=
2087        TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
2088                               Index ? Index->getZExtValue() : -1);
2089    ScalarizedCost +=
2090        TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
2092    ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
2093                                                    nullptr, nullptr, CostKind);
2094  }
2095
2096  LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
2097                    << "\n LoadExtractCost: " << OriginalCost
2098                    << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2099
2100  if (ScalarizedCost >= OriginalCost)
2101    return false;
2102
2103  // Ensure we add the load back to the worklist BEFORE its users so they can
2104  // erased in the correct order.
2105  Worklist.push(LI);
2106
2107  Type *ElemType = VecTy->getElementType();
2108
2109  // Replace extracts with narrow scalar loads.
2110  for (User *U : LI->users()) {
2111    auto *EI = cast<ExtractElementInst>(U);
2112    Value *Idx = EI->getIndexOperand();
2113
2114    // Insert 'freeze' for poison indexes.
2115    auto It = NeedFreeze.find(EI);
2116    if (It != NeedFreeze.end())
2117      It->second.freeze(Builder, *cast<Instruction>(Idx));
2118
2119    Builder.SetInsertPoint(EI);
2120    Value *GEP =
2121        Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
2122    auto *NewLoad = cast<LoadInst>(
2123        Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
2124
2125    Align ScalarOpAlignment =
2126        computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
2127    NewLoad->setAlignment(ScalarOpAlignment);
2128
    // With a known offset, narrow the AA metadata to the accessed element.
2129    if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
2130      size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
2131      AAMDNodes OldAAMD = LI->getAAMetadata();
2132      NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
2133    }
2134
2135    replaceValue(*EI, *NewLoad, false);
2136  }
2137
  // Success: keep the recorded freezes from being discarded by the guard.
2138  FailureGuard.release();
2139  return true;
2140}
2141
2142/// Try to scalarize vector loads feeding bitcast instructions.
/// Succeeds only when every user bitcasts the whole vector to one and the
/// same integer/FP scalar type of identical bit width, so the vector load
/// can be replaced with a single scalar load.
2143bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
2144                                         Value *Ptr) {
2145  InstructionCost OriginalCost =
2146      TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2148
2149  Type *TargetScalarType = nullptr;
2150  unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
2151
2152  for (User *U : LI->users()) {
2153    auto *BC = cast<BitCastInst>(U);
2154
2155    Type *DestTy = BC->getDestTy();
2156    if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
2157      return false;
2158
    // The cast must cover the whole vector; partial casts are not handled.
2159    unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
2160    if (DestBitWidth != VecBitWidth)
2161      return false;
2162
2163    // All bitcasts must target the same scalar type.
2164    if (!TargetScalarType)
2165      TargetScalarType = DestTy;
2166    else if (TargetScalarType != DestTy)
2167      return false;
2168
2169    OriginalCost +=
2170        TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
2172  }
2173
2174  if (!TargetScalarType)
2175    return false;
2176
2177  assert(!LI->user_empty() && "Unexpected load without bitcast users");
2178  InstructionCost ScalarizedCost =
2179      TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
2181
2182  LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2183                    << "\n OriginalCost: " << OriginalCost
2184                    << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2185
2186  if (ScalarizedCost >= OriginalCost)
2187    return false;
2188
2189  // Ensure we add the load back to the worklist BEFORE its users so they can
2190  // erased in the correct order.
2191  Worklist.push(LI);
2192
2193  Builder.SetInsertPoint(LI);
2194  auto *ScalarLoad =
2195      Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
2196  ScalarLoad->setAlignment(LI->getAlign());
2197  ScalarLoad->copyMetadata(*LI);
2198
2199  // Replace all bitcast users with the scalar load.
2200  for (User *U : LI->users()) {
2201    auto *BC = cast<BitCastInst>(U);
2202    replaceValue(*BC, *ScalarLoad, false);
2203  }
2204
2205  return true;
2206}
2207
2208bool VectorCombine::scalarizeExtExtract(Instruction &I) {
2210    return false;
2211  auto *Ext = dyn_cast<ZExtInst>(&I);
2212  if (!Ext)
2213    return false;
2214
2215  // Try to convert a vector zext feeding only extracts to a set of scalar
2216  //   (Src >> (ExtIdx * EltBits)) & EltMask
2217  // operations, if profitable.
2218  auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2219  if (!SrcTy)
2220    return false;
2221  auto *DstTy = cast<FixedVectorType>(Ext->getType());
2222
  // The whole source vector must fit into one destination-element-sized
  // integer so it can be handled as a single scalar value.
2223  Type *ScalarDstTy = DstTy->getElementType();
2224  if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2225    return false;
2226
2227  InstructionCost VectorCost =
2228      TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2230  unsigned ExtCnt = 0;
2231  bool ExtLane0 = false;
  // Every user must be an extract with a constant lane; dead extracts are
  // skipped from the cost but do not block the fold.
2232  for (User *U : Ext->users()) {
2233    uint64_t Idx;
2234    if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2235      return false;
2236    if (cast<Instruction>(U)->use_empty())
2237      continue;
2238    ExtCnt += 1;
2239    ExtLane0 |= !Idx;
2240    VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2241                                         CostKind, Idx, U);
2242  }
2243
  // Scalar form: one 'and' per live extract, plus a shift for every lane
  // except lane 0 (which needs no shift).
2244  InstructionCost ScalarCost =
2245      ExtCnt * TTI.getArithmeticInstrCost(
2246                   Instruction::And, ScalarDstTy, CostKind,
2249      (ExtCnt - ExtLane0) *
2251              Instruction::LShr, ScalarDstTy, CostKind,
2254  if (ScalarCost > VectorCost)
2255    return false;
2256
2257  Value *ScalarV = Ext->getOperand(0);
2258  if (!isGuaranteedNotToBePoison(ScalarV, SQ.AC, dyn_cast<Instruction>(ScalarV),
2259                                 SQ.DT)) {
2260    // Check whether all lanes are extracted, all extracts trigger UB
2261    // on poison, and the last extract (and hence all previous ones)
2262    // are guaranteed to execute if Ext executes. If so, we do not
2263    // need to insert a freeze.
2264    SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2265    bool AllExtractsTriggerUB = true;
2266    ExtractElementInst *LastExtract = nullptr;
2267    BasicBlock *ExtBB = Ext->getParent();
2268    for (User *U : Ext->users()) {
2269      auto *Extract = cast<ExtractElementInst>(U);
2270      if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2271        AllExtractsTriggerUB = false;
2272        break;
2273      }
2274      ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2275      if (!LastExtract || LastExtract->comesBefore(Extract))
2276        LastExtract = Extract;
2277    }
2278    if (ExtractedLanes.size() != DstTy->getNumElements() ||
2279        !AllExtractsTriggerUB ||
2281                                       LastExtract->getIterator()))
2282      ScalarV = Builder.CreateFreeze(ScalarV);
2283  }
  // View the source vector as one wide integer, then materialize each
  // extracted lane as shift + mask of that integer.
2284  ScalarV = Builder.CreateBitCast(
2285      ScalarV,
2286      IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2287  uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2288  uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2289  APInt EltBitMask = APInt::getLowBitsSet(TotalBits, SrcEltSizeInBits);
2290  Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2291  Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2292  for (User *U : Ext->users()) {
2293    auto *Extract = cast<ExtractElementInst>(U);
2294    uint64_t Idx =
2295        cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
    // Big-endian targets hold lane 0 in the high bits of the packed integer.
2296    uint64_t ShiftAmt =
2297        DL->isBigEndian()
2298            ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2299            : (Idx * SrcEltSizeInBits);
2300    Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2301    Value *And = Builder.CreateAnd(LShr, Mask);
2302    U->replaceAllUsesWith(And);
2303  }
2304  return true;
2305}
2306
2307/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2308/// to "(bitcast (concat X, Y))"
2309/// where X/Y are bitcasted from i1 mask vectors.
2310bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2311  Type *Ty = I.getType();
2312  if (!Ty->isIntegerTy())
2313    return false;
2314
2315  // TODO: Add big endian test coverage
2316  if (DL->isBigEndian())
2317    return false;
2318
2319  // Restrict to disjoint cases so the mask vectors aren't overlapping.
2320  Instruction *X, *Y;
2322    return false;
2323
2324  // Allow both sources to contain shl, to handle more generic pattern:
2325  // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2326  Value *SrcX;
2327  uint64_t ShAmtX = 0;
2328  if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2329      !match(X, m_OneUse(
2331                    m_ConstantInt(ShAmtX)))))
2332    return false;
2333
2334  Value *SrcY;
2335  uint64_t ShAmtY = 0;
2336  if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2337      !match(Y, m_OneUse(
2339                    m_ConstantInt(ShAmtY)))))
2340    return false;
2341
2342  // Canonicalize larger shift to the RHS.
2343  if (ShAmtX > ShAmtY) {
2344    std::swap(X, Y);
2345    std::swap(SrcX, SrcY);
2346    std::swap(ShAmtX, ShAmtY);
2347  }
2348
2349  // Ensure both sources are matching vXi1 bool mask types, and that the shift
2350  // difference is the mask width so they can be easily concatenated together.
2351  uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2352  unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2353  unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2354  auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2355  if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2356      !MaskTy->getElementType()->isIntegerTy(1) ||
2357      MaskTy->getNumElements() != ShAmtDiff ||
2358      MaskTy->getNumElements() > (BitWidth / 2))
2359    return false;
2360
  // Concatenation doubles the mask width; an iota mask selects all lanes of
  // X followed by all lanes of Y.
2361  auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2362  auto *ConcatIntTy =
2363      Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2364  auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2365
2366  SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2367  std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2368
2369  // TODO: Is it worth supporting multi use cases?
2370  InstructionCost OldCost = 0;
2371  OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2372  OldCost +=
2373      NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2374  OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2376  OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2378
2379  InstructionCost NewCost = 0;
2381                                MaskTy, ConcatMask, CostKind);
2382  NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2384  if (Ty != ConcatIntTy)
2385    NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2387  if (ShAmtX > 0)
2388    NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2389
2390  LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2391                    << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2392                    << "\n");
2393
2394  if (NewCost > OldCost)
2395    return false;
2396
2397  // Build bool mask concatenation, bitcast back to scalar integer, and perform
2398  // any residual zero-extension or shifting.
2399  Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2400  Worklist.pushValue(Concat);
2401
2402  Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2403
2404  if (Ty != ConcatIntTy) {
2405    Worklist.pushValue(Result);
2406    Result = Builder.CreateZExt(Result, Ty);
2407  }
2408
  // The smaller of the two original shifts survives as a residual shl.
2409  if (ShAmtX > 0) {
2410    Worklist.pushValue(Result);
2411    Result = Builder.CreateShl(Result, ShAmtX);
2412  }
2413
2414  replaceValue(I, *Result);
2415  return true;
2416}
2417
2418/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2419///           --> "binop (shuffle), (shuffle)".
2420bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2421  BinaryOperator *BinOp;
2422  ArrayRef<int> OuterMask;
2423  if (!match(&I, m_Shuffle(m_BinOp(BinOp), m_Undef(), m_Mask(OuterMask))))
2424    return false;
2425
2426  // Don't introduce poison into div/rem.
2427  if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2428    return false;
2429
2430  Value *Op00, *Op01, *Op10, *Op11;
2431  ArrayRef<int> Mask0, Mask1;
  // Either binop operand may itself be a shuffle; at least one must be.
2432  bool Match0 = match(BinOp->getOperand(0),
2433                      m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)));
2434  bool Match1 = match(BinOp->getOperand(1),
2435                      m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)));
2436  if (!Match0 && !Match1)
2437    return false;
2438
  // Where an inner shuffle did not match, treat the operand as an identity
  // shuffle of itself.
2439  Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2440  Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2441  Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2442  Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2443
2444  Instruction::BinaryOps Opcode = BinOp->getOpcode();
2445  auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2446  auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2447  auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2448  auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2449  if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2450    return false;
2451
2452  unsigned NumSrcElts = BinOpTy->getNumElements();
2453
2454  // Don't accept shuffles that reference the second operand in
2455  // div/rem or if its an undef arg.
2456  if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2457      any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2458    return false;
2459
2460  // Merge outer / inner (or identity if no match) shuffles.
2461  SmallVector<int> NewMask0, NewMask1;
2462  for (int M : OuterMask) {
2463    if (M < 0 || M >= (int)NumSrcElts) {
2464      NewMask0.push_back(PoisonMaskElem);
2465      NewMask1.push_back(PoisonMaskElem);
2466    } else {
2467      NewMask0.push_back(Match0 ? Mask0[M] : M);
2468      NewMask1.push_back(Match1 ? Mask1[M] : M);
2469    }
2470  }
2471
  // A merged mask that is an identity over the first source needs no shuffle
  // at all and lets us reuse the operand directly.
2472  unsigned NumOpElts = Op0Ty->getNumElements();
2473  bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2474      all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2475      ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2476  bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2477      all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2478      ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2479
2480  InstructionCost NewCost = 0;
2481  // Try to merge shuffles across the binop if the new shuffles are not costly.
2482  InstructionCost BinOpCost =
2483      TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind);
2484  InstructionCost OldCost =
2486                             ShuffleDstTy, BinOpTy, OuterMask, CostKind,
2487                             0, nullptr, {BinOp}, &I);
  // Multi-use values survive the transform, so their cost counts both ways.
2488  if (!BinOp->hasOneUse())
2489    NewCost += BinOpCost;
2490
2491  if (Match0) {
2493        TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2494        0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2495    OldCost += Shuf0Cost;
2496    if (!BinOp->hasOneUse() || !BinOp->getOperand(0)->hasOneUse())
2497      NewCost += Shuf0Cost;
2498  }
2499  if (Match1) {
2501        TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2502        0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2503    OldCost += Shuf1Cost;
2504    if (!BinOp->hasOneUse() || !BinOp->getOperand(1)->hasOneUse())
2505      NewCost += Shuf1Cost;
2506  }
2507
2508  NewCost += TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2509
2510  if (!IsIdentity0)
2511    NewCost +=
2513                            Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2514  if (!IsIdentity1)
2515    NewCost +=
2517                            Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2518
2519  LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2520                    << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2521                    << "\n");
2522
2523  // If costs are equal, still fold as we reduce instruction count.
2524  if (NewCost > OldCost)
2525    return false;
2526
  // Materialize the merged shuffles (or reuse the operand for identities)
  // and rebuild the binop on top of them.
2527  Value *LHS =
2528      IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2529  Value *RHS =
2530      IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2531  Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2532
2533  // Intersect flags from the old binops.
2534  if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2535    NewInst->copyIRFlags(BinOp);
2536
2537  Worklist.pushValue(LHS);
2538  Worklist.pushValue(RHS);
2539  replaceValue(I, *NewBO);
2540  return true;
2541}
2542
2543/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2544/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
2545 bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2546 ArrayRef<int> OldMask;
2547 Instruction *LHS, *RHS;
// Match a shuffle whose two operands are instructions (the candidate
// binops/cmps), capturing the shuffle mask as OldMask.
2549 m_Mask(OldMask))))
2550 return false;
2551
2552 // TODO: Add support for addlike etc.
2553 if (LHS->getOpcode() != RHS->getOpcode())
2554 return false;
2555
// Extract the operands: X/Y feed LHS, Z/W feed RHS. For compares both
// predicates must agree before we can merge them into one cmp.
2556 Value *X, *Y, *Z, *W;
2557 bool IsCommutative = false;
2558 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2559 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2560 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2561 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2562 auto *BO = cast<BinaryOperator>(LHS);
2563 // Don't introduce poison into div/rem.
2564 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2565 return false;
2566 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2567 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2568 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2569 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2570 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2571 } else
2572 return false;
2573
2574 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2575 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2576 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2577 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2578 return false;
2579
2580 bool SameBinOp = LHS == RHS;
2581 unsigned NumSrcElts = BinOpTy->getNumElements();
2582
2583 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2584 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2585 std::swap(X, Y);
2586
// Rewrites a two-source mask index into the equivalent single-source index,
// used when both shuffle sources turn out to be the same value.
2587 auto ConvertToUnary = [NumSrcElts](int &M) {
2588 if (M >= (int)NumSrcElts)
2589 M -= NumSrcElts;
2590 };
2591
2592 SmallVector<int> NewMask0(OldMask);
2594 TTI::OperandValueInfo Op0Info = TTI.commonOperandInfo(X, Z);
2595 if (X == Z) {
2596 llvm::for_each(NewMask0, ConvertToUnary);
2598 Z = PoisonValue::get(BinOpTy);
2599 }
2600
2601 SmallVector<int> NewMask1(OldMask);
2603 TTI::OperandValueInfo Op1Info = TTI.commonOperandInfo(Y, W);
2604 if (Y == W) {
2605 llvm::for_each(NewMask1, ConvertToUnary);
2607 W = PoisonValue::get(BinOpTy);
2608 }
2609
2610 // Try to replace a binop with a shuffle if the shuffle is not costly.
2611 // When SameBinOp, only count the binop cost once.
// NOTE(review): LHSCost/RHSCost (per-instruction TTI costs of the original
// LHS/RHS) are declared on lines elided from this render — confirm upstream.
2614
2615 InstructionCost OldCost = LHSCost;
2616 if (!SameBinOp) {
2617 OldCost += RHSCost;
2618 }
// OldCost also includes the outer shuffle itself (getShuffleCost call whose
// opening line is elided here).
2620 ShuffleDstTy, BinResTy, OldMask, CostKind, 0,
2621 nullptr, {LHS, RHS}, &I);
2622
2623 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2624 // where one use shuffles have gotten split across the binop/cmp. These
2625 // often allow a major reduction in total cost that wouldn't happen as
2626 // individual folds.
2627 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2628 TTI::TargetCostKind CostKind) -> bool {
2629 Value *InnerOp;
2630 ArrayRef<int> InnerMask;
2631 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2632 m_Mask(InnerMask)))) &&
2633 InnerOp->getType() == Op->getType() &&
2634 all_of(InnerMask,
2635 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
// Compose the inner one-use shuffle's mask into this operand's slice of
// the outer mask, then replace the operand with the inner source.
2636 for (int &M : Mask)
2637 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2638 M = InnerMask[M - Offset];
2639 M = 0 <= M ? M + Offset : M;
2640 }
2642 Op = InnerOp;
2643 return true;
2644 }
2645 return false;
2646 };
2647 bool ReducedInstCount = false;
2648 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2649 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2650 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2651 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2652 bool SingleSrcBinOp = (X == Y) && (Z == W) && (NewMask0 == NewMask1);
2653 // SingleSrcBinOp only reduces instruction count if we also eliminate the
2654 // original binop(s). If binops have multiple uses, they won't be eliminated.
2655 ReducedInstCount |= SingleSrcBinOp && LHS->hasOneUser() && RHS->hasOneUser();
2656
2657 auto *ShuffleCmpTy =
2658 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
// NewCost: new operand shuffle(s) (SK0/SK1 shuffle kinds are declared on
// elided lines — presumably unary vs two-src; confirm upstream) ...
2660 SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z});
2661 if (!SingleSrcBinOp)
2662 NewCost += TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1,
2663 CostKind, 0, nullptr, {Y, W});
2664
// ... plus one merged binop or cmp on the shuffled operands.
2665 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2666 NewCost += TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy,
2667 CostKind, Op0Info, Op1Info);
2668 } else {
2669 NewCost +=
2670 TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy, ShuffleDstTy,
2671 PredLHS, CostKind, Op0Info, Op1Info);
2672 }
2673 // If LHS/RHS have other uses, we need to account for the cost of keeping
2674 // the original instructions. When SameBinOp, only add the cost once.
2675 if (!LHS->hasOneUser())
2676 NewCost += LHSCost;
2677 if (!SameBinOp && !RHS->hasOneUser())
2678 NewCost += RHSCost;
2679
2680 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2681 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2682 << "\n");
2683
2684 // If either shuffle will constant fold away, then fold for the same cost as
2685 // we will reduce the instruction count.
2686 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2687 (isa<Constant>(Y) && isa<Constant>(W));
2688 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2689 return false;
2690
2691 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2692 Value *Shuf1 =
2693 SingleSrcBinOp ? Shuf0 : Builder.CreateShuffleVector(Y, W, NewMask1);
2694 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2695 ? Builder.CreateBinOp(
2696 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2697 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2698
2699 // Intersect flags from the old binops.
2700 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2701 NewInst->copyIRFlags(LHS);
2702 NewInst->andIRFlags(RHS);
2703 }
2704
2705 Worklist.pushValue(Shuf0);
2706 Worklist.pushValue(Shuf1);
2707 replaceValue(I, *NewBO);
2708 return true;
2709}
2710
2711/// Try to convert,
2712/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2713/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
2714 bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2715 ArrayRef<int> Mask;
2716 Value *C1, *T1, *F1, *C2, *T2, *F2;
2717 if (!match(&I, m_Shuffle(m_Select(m_Value(C1), m_Value(T1), m_Value(F1)),
2718 m_Select(m_Value(C2), m_Value(T2), m_Value(F2)),
2719 m_Mask(Mask))))
2720 return false;
2721
2722 auto *Sel1 = cast<Instruction>(I.getOperand(0));
2723 auto *Sel2 = cast<Instruction>(I.getOperand(1));
2724
// Both selects must use vector conditions of the same fixed type; a scalar
// condition (or mismatched widths) cannot be shuffled element-wise.
2725 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2726 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2727 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2728 return false;
2729
2730 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2731 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2732 // SelectInsts must have the same FMF.
2733 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2734 ((SI0FOp != nullptr) &&
2735 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2736 return false;
2737
2738 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2739 auto *DstVecTy = cast<FixedVectorType>(I.getType());
// NOTE(review): the shuffle-kind SK is declared on a line elided from this
// render — confirm its value against upstream.
2741 auto SelOp = Instruction::Select;
2742
// CostSel1/CostSel2: TTI select costs of the two original selects (their
// declaration lines are elided here).
2744 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2746 SelOp, SrcVecTy, C2VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2747
2748 InstructionCost OldCost =
2749 CostSel1 + CostSel2 +
2750 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2751 {I.getOperand(0), I.getOperand(1)}, &I);
2752
// NewCost: three shuffles (conditions, true values, false values) plus one
// select on the widened type.
2754 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2755 Mask, CostKind, 0, nullptr, {C1, C2});
2756 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2757 nullptr, {T1, T2});
2758 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2759 nullptr, {F1, F2});
2760 auto *C1C2ShuffledVecTy = FixedVectorType::get(
2761 Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements());
2762 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2764
// Multi-use selects must stay around, so charge them to the new cost too.
2765 if (!Sel1->hasOneUse())
2766 NewCost += CostSel1;
2767 if (!Sel2->hasOneUse())
2768 NewCost += CostSel2;
2769
2770 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2771 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2772 << "\n");
2773 if (NewCost > OldCost)
2774 return false;
2775
2776 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2777 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2778 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2779 Value *NewSel;
2780 // We presuppose that the SelectInsts have the same FMF.
2781 if (SI0FOp)
2782 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2783 SI0FOp->getFastMathFlags());
2784 else
2785 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2786
2787 Worklist.pushValue(ShuffleCmp);
2788 Worklist.pushValue(ShuffleTrue);
2789 Worklist.pushValue(ShuffleFalse);
2790 replaceValue(I, *NewSel);
2791 return true;
2792}
2793
2794/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2795/// into "castop (shuffle)".
2796 bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2797 Value *V0, *V1;
2798 ArrayRef<int> OldMask;
2799 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2800 return false;
2801
2802 // Check whether this is a binary shuffle.
2803 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2804
2805 auto *C0 = dyn_cast<CastInst>(V0);
2806 auto *C1 = dyn_cast<CastInst>(V1);
2807 if (!C0 || (IsBinaryShuffle && !C1))
2808 return false;
2809
2810 Instruction::CastOps Opcode = C0->getOpcode();
2811
2812 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2813 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2814 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2815 return false;
2816
2817 if (IsBinaryShuffle) {
2818 if (C0->getSrcTy() != C1->getSrcTy())
2819 return false;
2820 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2821 if (Opcode != C1->getOpcode()) {
2822 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2823 Opcode = Instruction::SExt;
2824 else
2825 return false;
2826 }
2827 }
2828
2829 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2830 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2831 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2832 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2833 return false;
2834
2835 unsigned NumSrcElts = CastSrcTy->getNumElements();
2836 unsigned NumDstElts = CastDstTy->getNumElements();
2837 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2838 "Only bitcasts expected to alter src/dst element counts");
2839
2840 // Check for bitcasting of unscalable vector types.
2841 // e.g. <32 x i40> -> <40 x i32>
2842 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2843 (NumDstElts % NumSrcElts) != 0)
2844 return false;
2845
// Scale the shuffle mask so it operates on pre-cast element granularity.
2846 SmallVector<int, 16> NewMask;
2847 if (NumSrcElts >= NumDstElts) {
2848 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2849 // always be expanded to the equivalent form choosing narrower elements.
2850 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2851 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2852 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2853 } else {
2854 // The bitcast is from narrow elements to wide elements. The shuffle mask
2855 // must choose consecutive elements to allow casting first.
2856 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2857 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2858 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2859 return false;
2860 }
2861
2862 auto *NewShuffleDstTy =
2863 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2864
2865 // Try to replace a castop with a shuffle if the shuffle is not costly.
2866 InstructionCost CostC0 =
2867 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2869
// NOTE(review): ShuffleKind's declaration/assignments (binary vs unary
// shuffle) sit on lines elided from this render — confirm upstream.
2871 if (IsBinaryShuffle)
2873 else
2875
2876 InstructionCost OldCost = CostC0;
2877 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2878 CostKind, 0, nullptr, {}, &I);
2879
2880 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2881 CastSrcTy, NewMask, CostKind);
2882 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
// Multi-use casts survive the fold, so their cost stays in the new total.
2884 if (!C0->hasOneUse())
2885 NewCost += CostC0;
2886 if (IsBinaryShuffle) {
2887 InstructionCost CostC1 =
2888 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2890 OldCost += CostC1;
2891 if (!C1->hasOneUse())
2892 NewCost += CostC1;
2893 }
2894
2895 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2896 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2897 << "\n");
2898 if (NewCost > OldCost)
2899 return false;
2900
2901 Value *Shuf;
2902 if (IsBinaryShuffle)
2903 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2904 NewMask);
2905 else
2906 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2907
2908 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2909
2910 // Intersect flags from the old casts.
2911 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2912 NewInst->copyIRFlags(C0);
2913 if (IsBinaryShuffle)
2914 NewInst->andIRFlags(C1);
2915 }
2916
2917 Worklist.pushValue(Shuf);
2918 replaceValue(I, *Cast);
2919 return true;
2920}
2921
2922/// Try to convert any of:
2923/// "shuffle (shuffle x, y), (shuffle y, x)"
2924/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2925/// "shuffle (shuffle x, undef), y"
2926/// "shuffle x, (shuffle y, undef)"
2927/// into "shuffle x, y".
2928 bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2929 ArrayRef<int> OuterMask;
2930 Value *OuterV0, *OuterV1;
2931 if (!match(&I,
2932 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2933 return false;
2934
2935 ArrayRef<int> InnerMask0, InnerMask1;
2936 Value *X0, *X1, *Y0, *Y1;
2937 bool Match0 =
2938 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2939 bool Match1 =
2940 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2941 if (!Match0 && !Match1)
2942 return false;
2943
2944 // If the outer shuffle is a permute, then create a fake inner all-poison
2945 // shuffle. This is easier than accounting for length-changing shuffles below.
2946 SmallVector<int, 16> PoisonMask1;
2947 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2948 X1 = X0;
2949 Y1 = Y0;
2950 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2951 InnerMask1 = PoisonMask1;
2952 Match1 = true; // fake match
2953 }
2954
// For unmatched sides, treat the outer operand itself as both "sources" so
// the remapping loop below can index it uniformly.
2955 X0 = Match0 ? X0 : OuterV0;
2956 Y0 = Match0 ? Y0 : OuterV0;
2957 X1 = Match1 ? X1 : OuterV1;
2958 Y1 = Match1 ? Y1 : OuterV1;
2959 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2960 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2961 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2962 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2963 X0->getType() != X1->getType())
2964 return false;
2965
2966 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2967 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2968
2969 // Attempt to merge shuffles, matching upto 2 source operands.
2970 // Replace index to a poison arg with PoisonMaskElem.
2971 // Bail if either inner masks reference an undef arg.
2972 SmallVector<int, 16> NewMask(OuterMask);
2973 Value *NewX = nullptr, *NewY = nullptr;
2974 for (int &M : NewMask) {
2975 Value *Src = nullptr;
2976 if (0 <= M && M < (int)NumImmElts) {
2977 Src = OuterV0;
2978 if (Match0) {
// Compose: translate an outer index through InnerMask0 to the ultimate
// source (X0 or Y0) and its lane within that source.
2979 M = InnerMask0[M];
2980 Src = M >= (int)NumSrcElts ? Y0 : X0;
2981 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2982 }
2983 } else if (M >= (int)NumImmElts) {
2984 Src = OuterV1;
2985 M -= NumImmElts;
2986 if (Match1) {
2987 M = InnerMask1[M];
2988 Src = M >= (int)NumSrcElts ? Y1 : X1;
2989 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2990 }
2991 }
// Assign each referenced source to the first/second slot of the merged
// shuffle; more than two distinct sources means we cannot fold.
2992 if (Src && M != PoisonMaskElem) {
2993 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2994 if (isa<UndefValue>(Src)) {
2995 // We've referenced an undef element - if its poison, update the shuffle
2996 // mask, else bail.
2997 if (!isa<PoisonValue>(Src))
2998 return false;
2999 M = PoisonMaskElem;
3000 continue;
3001 }
3002 if (!NewX || NewX == Src) {
3003 NewX = Src;
3004 continue;
3005 }
3006 if (!NewY || NewY == Src) {
3007 M += NumSrcElts;
3008 NewY = Src;
3009 continue;
3010 }
3011 return false;
3012 }
3013 }
3014
// NOTE(review): this function returns bool, but the statement below returns
// a Value* — the pointer implicitly converts to `true` while `I` is never
// replaced with the poison value. Looks like it should be
// `replaceValue(I, *PoisonValue::get(ShuffleDstTy)); return true;` — confirm
// against upstream before changing.
3015 if (!NewX)
3016 return PoisonValue::get(ShuffleDstTy);
3017 if (!NewY)
3018 NewY = PoisonValue::get(ShuffleSrcTy);
3019
3020 // Have we folded to an Identity shuffle?
3021 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
3022 replaceValue(I, *NewX);
3023 return true;
3024 }
3025
3026 // Try to merge the shuffles if the new shuffle is not costly.
3027 InstructionCost InnerCost0 = 0;
3028 if (Match0)
3029 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
3030
3031 InstructionCost InnerCost1 = 0;
3032 if (Match1)
3033 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
3034
// OuterCost (outer shuffle's TTI cost) is declared on a line elided from
// this render.
3036
3037 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
3038
3039 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
3043 InstructionCost NewCost =
3044 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
3045 nullptr, {NewX, NewY});
3046 if (!OuterV0->hasOneUse())
3047 NewCost += InnerCost0;
3048 if (!OuterV1->hasOneUse())
3049 NewCost += InnerCost1;
3050
3051 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
3052 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3053 << "\n");
3054 if (NewCost > OldCost)
3055 return false;
3056
3057 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
3058 replaceValue(I, *Shuf);
3059 return true;
3060}
3061
3062/// Try to convert a chain of length-preserving shuffles that are fed by
3063/// length-changing shuffles from the same source, e.g. a chain of length 3:
3064///
3065/// "shuffle (shuffle (shuffle x, (shuffle y, undef)),
3066/// (shuffle y, undef)),
3067 /// (shuffle y, undef)"
3068///
3069/// into a single shuffle fed by a length-changing shuffle:
3070///
3071/// "shuffle x, (shuffle y, undef)"
3072///
3073/// Such chains arise e.g. from folding extract/insert sequences.
3074 bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
3075 FixedVectorType *TrunkType = dyn_cast<FixedVectorType>(I.getType());
3076 if (!TrunkType)
3077 return false;
3078
// Walk up the "trunk" of length-preserving shuffles, accumulating a combined
// trunk mask (Mask) and a combined mask for the single leaf source Y (YMask),
// while tracking old vs new cost so we stop when folding stops paying off.
3079 unsigned ChainLength = 0;
3080 SmallVector<int> Mask;
3081 SmallVector<int> YMask;
3082 InstructionCost OldCost = 0;
3083 InstructionCost NewCost = 0;
3084 Value *Trunk = &I;
3085 unsigned NumTrunkElts = TrunkType->getNumElements();
3086 Value *Y = nullptr;
3087
3088 for (;;) {
3089 // Match the current trunk against (commutations of) the pattern
3090 // "shuffle trunk', (shuffle y, undef)"
3091 ArrayRef<int> OuterMask;
3092 Value *OuterV0, *OuterV1;
3093 if (ChainLength != 0 && !Trunk->hasOneUse())
3094 break;
3095 if (!match(Trunk, m_Shuffle(m_Value(OuterV0), m_Value(OuterV1),
3096 m_Mask(OuterMask))))
3097 break;
3098 if (OuterV0->getType() != TrunkType) {
3099 // This shuffle is not length-preserving, so it cannot be part of the
3100 // chain.
3101 break;
3102 }
3103
3104 ArrayRef<int> InnerMask0, InnerMask1;
3105 Value *A0, *A1, *B0, *B1;
3106 bool Match0 =
3107 match(OuterV0, m_Shuffle(m_Value(A0), m_Value(B0), m_Mask(InnerMask0)));
3108 bool Match1 =
3109 match(OuterV1, m_Shuffle(m_Value(A1), m_Value(B1), m_Mask(InnerMask1)));
3110 bool Match0Leaf = Match0 && A0->getType() != I.getType();
3111 bool Match1Leaf = Match1 && A1->getType() != I.getType();
3112 if (Match0Leaf == Match1Leaf) {
3113 // Only handle the case of exactly one leaf in each step. The "two leaves"
3114 // case is handled by foldShuffleOfShuffles.
3115 break;
3116 }
3117
// Canonicalize so the leaf is always operand 1: swap operands and flip the
// two halves of the outer mask accordingly.
3118 SmallVector<int> CommutedOuterMask;
3119 if (Match0Leaf) {
3120 std::swap(OuterV0, OuterV1);
3121 std::swap(InnerMask0, InnerMask1);
3122 std::swap(A0, A1);
3123 std::swap(B0, B1);
3124 llvm::append_range(CommutedOuterMask, OuterMask);
3125 for (int &M : CommutedOuterMask) {
3126 if (M == PoisonMaskElem)
3127 continue;
3128 if (M < (int)NumTrunkElts)
3129 M += NumTrunkElts;
3130 else
3131 M -= NumTrunkElts;
3132 }
3133 OuterMask = CommutedOuterMask;
3134 }
3135 if (!OuterV1->hasOneUse())
3136 break;
3137
// All leaves across the whole chain must read from one common source Y.
3138 if (!isa<UndefValue>(A1)) {
3139 if (!Y)
3140 Y = A1;
3141 else if (Y != A1)
3142 break;
3143 }
3144 if (!isa<UndefValue>(B1)) {
3145 if (!Y)
3146 Y = B1;
3147 else if (Y != B1)
3148 break;
3149 }
3150
3151 auto *YType = cast<FixedVectorType>(A1->getType());
3152 int NumLeafElts = YType->getNumElements();
// Fold the two-source leaf mask into single-source-of-Y form.
3153 SmallVector<int> LocalYMask(InnerMask1);
3154 for (int &M : LocalYMask) {
3155 if (M >= NumLeafElts)
3156 M -= NumLeafElts;
3157 }
3158
// LocalOldCost: cost of this link's two shuffles (expression partially
// elided from this render).
3159 InstructionCost LocalOldCost =
3162
3163 // Handle the initial (start of chain) case.
3164 if (!ChainLength) {
3165 Mask.assign(OuterMask);
3166 YMask.assign(LocalYMask);
3167 OldCost = NewCost = LocalOldCost;
3168 Trunk = OuterV0;
3169 ChainLength++;
3170 continue;
3171 }
3172
3173 // For the non-root case, first attempt to combine masks.
// Each Y lane may only be demanded with one consistent index.
3174 SmallVector<int> NewYMask(YMask);
3175 bool Valid = true;
3176 for (auto [CombinedM, LeafM] : llvm::zip(NewYMask, LocalYMask)) {
3177 if (LeafM == -1 || CombinedM == LeafM)
3178 continue;
3179 if (CombinedM == -1) {
3180 CombinedM = LeafM;
3181 } else {
3182 Valid = false;
3183 break;
3184 }
3185 }
3186 if (!Valid)
3187 break;
3188
// Compose the accumulated trunk mask with this link's outer mask; indices
// pointing at the leaf half (or poison) pass through unchanged.
3189 SmallVector<int> NewMask;
3190 NewMask.reserve(NumTrunkElts);
3191 for (int M : Mask) {
3192 if (M < 0 || M >= static_cast<int>(NumTrunkElts))
3193 NewMask.push_back(M);
3194 else
3195 NewMask.push_back(OuterMask[M]);
3196 }
3197
3198 // Break the chain if adding this new step complicates the shuffles such
3199 // that it would increase the new cost by more than the old cost of this
3200 // step.
3201 InstructionCost LocalNewCost =
3203 YType, NewYMask, CostKind) +
3205 TrunkType, NewMask, CostKind);
3206
3207 if (LocalNewCost >= NewCost && LocalOldCost < LocalNewCost - NewCost)
3208 break;
3209
3210 LLVM_DEBUG({
3211 if (ChainLength == 1) {
3212 dbgs() << "Found chain of shuffles fed by length-changing shuffles: "
3213 << I << '\n';
3214 }
3215 dbgs() << " next chain link: " << *Trunk << '\n'
3216 << " old cost: " << (OldCost + LocalOldCost)
3217 << " new cost: " << LocalNewCost << '\n';
3218 });
3219
3220 Mask = NewMask;
3221 YMask = NewYMask;
3222 OldCost += LocalOldCost;
3223 NewCost = LocalNewCost;
3224 Trunk = OuterV0;
3225 ChainLength++;
3226 }
// A chain of length 1 is just a plain shuffle-of-shuffle; nothing to fold.
3227 if (ChainLength <= 1)
3228 return false;
3229
3230 if (llvm::all_of(Mask, [&](int M) {
3231 return M < 0 || M >= static_cast<int>(NumTrunkElts);
3232 })) {
3233 // Produce a canonical simplified form if all elements are sourced from Y.
3234 for (int &M : Mask) {
3235 if (M >= static_cast<int>(NumTrunkElts))
3236 M = YMask[M - NumTrunkElts];
3237 }
3238 Value *Root =
3239 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), Mask);
3240 replaceValue(I, *Root);
3241 return true;
3242 }
3243
// General case: one length-changing leaf shuffle of Y feeding one combined
// trunk shuffle.
3244 Value *Leaf =
3245 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), YMask);
3246 Value *Root = Builder.CreateShuffleVector(Trunk, Leaf, Mask);
3247 replaceValue(I, *Root);
3248 return true;
3249}
3250
3251/// Try to convert
3252/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
3253 bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
3254 Value *V0, *V1;
3255 ArrayRef<int> OldMask;
3256 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
3257 return false;
3258
3259 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3260 auto *II1 = dyn_cast<IntrinsicInst>(V1);
3261 if (!II0 || !II1)
3262 return false;
3263
3264 Intrinsic::ID IID = II0->getIntrinsicID();
3265 if (IID != II1->getIntrinsicID())
3266 return false;
3267 InstructionCost CostII0 =
3268 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3269 InstructionCost CostII1 =
3270 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind);
3271
3272 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3273 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
3274 if (!ShuffleDstTy || !II0Ty)
3275 return false;
3276
3277 if (!isTriviallyVectorizable(IID))
3278 return false;
3279
// Scalar ("uniform") operands cannot be shuffled, so both calls must agree
// on them (the guard condition's first line is elided from this render —
// presumably isVectorIntrinsicWithScalarOpAtArg; confirm upstream).
3280 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3282 II0->getArgOperand(I) != II1->getArgOperand(I))
3283 return false;
3284
3285 InstructionCost OldCost =
3286 CostII0 + CostII1 +
3288 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
3289
// New cost: one shuffle per distinct vector operand pair plus the widened
// intrinsic call; duplicate operand pairs are only charged once.
3290 SmallVector<Type *> NewArgsTy;
3291 InstructionCost NewCost = 0;
3292 SmallDenseSet<std::pair<Value *, Value *>> SeenOperandPairs;
3293 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3295 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3296 } else {
3297 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3298 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3299 ShuffleDstTy->getNumElements());
3300 NewArgsTy.push_back(ArgTy);
3301 std::pair<Value *, Value *> OperandPair =
3302 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3303 if (!SeenOperandPairs.insert(OperandPair).second) {
3304 // We've already computed the cost for this operand pair.
3305 continue;
3306 }
3307 NewCost += TTI.getShuffleCost(
3308 TargetTransformInfo::SK_PermuteTwoSrc, ArgTy, VecTy, OldMask,
3309 CostKind, 0, nullptr, {II0->getArgOperand(I), II1->getArgOperand(I)});
3310 }
3311 }
3312 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3313
3314 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3315 if (!II0->hasOneUse())
3316 NewCost += CostII0;
3317 if (II1 != II0 && !II1->hasOneUse())
3318 NewCost += CostII1;
3319
3320 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
3321 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3322 << "\n");
3323
3324 if (NewCost > OldCost)
3325 return false;
3326
// Build the shuffled arguments, reusing one shuffle per operand pair so the
// emitted IR matches the de-duplicated cost model above.
3327 SmallVector<Value *> NewArgs;
3328 SmallDenseMap<std::pair<Value *, Value *>, Value *> ShuffleCache;
3329 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3331 NewArgs.push_back(II0->getArgOperand(I));
3332 } else {
3333 std::pair<Value *, Value *> OperandPair =
3334 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3335 auto It = ShuffleCache.find(OperandPair);
3336 if (It != ShuffleCache.end()) {
3337 // Reuse previously created shuffle for this operand pair.
3338 NewArgs.push_back(It->second);
3339 continue;
3340 }
3341 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
3342 II1->getArgOperand(I), OldMask);
3343 ShuffleCache[OperandPair] = Shuf;
3344 NewArgs.push_back(Shuf);
3345 Worklist.pushValue(Shuf);
3346 }
3347 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3348
3349 // Intersect flags from the old intrinsics.
3350 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
3351 NewInst->copyIRFlags(II0);
3352 NewInst->andIRFlags(II1);
3353 }
3354
3355 replaceValue(I, *NewIntrinsic);
3356 return true;
3357}
3358
3359/// Try to convert
3360/// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
3361 bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
3362 Value *V0;
3363 ArrayRef<int> Mask;
3364 if (!match(&I, m_Shuffle(m_Value(V0), m_Undef(), m_Mask(Mask))))
3365 return false;
3366
3367 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3368 if (!II0)
3369 return false;
3370
3371 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3372 auto *IntrinsicSrcTy = dyn_cast<FixedVectorType>(II0->getType());
3373 if (!ShuffleDstTy || !IntrinsicSrcTy)
3374 return false;
3375
3376 // Validate it's a pure permute, mask should only reference the first vector
3377 unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
3378 if (any_of(Mask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
3379 return false;
3380
3381 Intrinsic::ID IID = II0->getIntrinsicID();
3382 if (!isTriviallyVectorizable(IID))
3383 return false;
3384
3385 // Cost analysis
// IntrinsicCost (declaration line elided in this render) holds the TTI cost
// of the original intrinsic call.
3387 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3388 InstructionCost OldCost =
3391 IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
3392
// New cost: one permute per vector operand (scalar operands pass through
// unchanged) plus the permuted intrinsic call.
3393 SmallVector<Type *> NewArgsTy;
3394 InstructionCost NewCost = 0;
3395 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3397 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3398 } else {
3399 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3400 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3401 ShuffleDstTy->getNumElements());
3402 NewArgsTy.push_back(ArgTy);
3404 ArgTy, VecTy, Mask, CostKind, 0, nullptr,
3405 {II0->getArgOperand(I)});
3406 }
3407 }
3408 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3409 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3410
3411 // If the intrinsic has multiple uses, we need to account for the cost of
3412 // keeping the original intrinsic around.
3413 if (!II0->hasOneUse())
3414 NewCost += IntrinsicCost;
3415
3416 LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n OldCost: "
3417 << OldCost << " vs NewCost: " << NewCost << "\n");
3418
3419 if (NewCost > OldCost)
3420 return false;
3421
3422 // Transform
3423 SmallVector<Value *> NewArgs;
3424 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3426 NewArgs.push_back(II0->getArgOperand(I));
3427 } else {
3428 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), Mask);
3429 NewArgs.push_back(Shuf);
3430 Worklist.pushValue(Shuf);
3431 }
3432 }
3433
3434 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3435
// Only one source intrinsic, so its flags carry over directly.
3436 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic))
3437 NewInst->copyIRFlags(II0);
3438
3439 replaceValue(I, *NewIntrinsic);
3440 return true;
3441}
3442
3443using InstLane = std::pair<Value *, int>;
3444
3445static InstLane lookThroughShuffles(Value *V, int Lane) {
3446 while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
3447 unsigned NumElts =
3448 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
3449 int M = SV->getMaskValue(Lane);
3450 if (M < 0)
3451 return {nullptr, PoisonMaskElem};
3452 if (static_cast<unsigned>(M) < NumElts) {
3453 V = SV->getOperand(0);
3454 Lane = M;
3455 } else {
3456 V = SV->getOperand(1);
3457 Lane = M - NumElts;
3458 }
3459 }
3460 return InstLane{V, Lane};
3461}
3462
// NOTE(review): the signature of this helper sits on lines elided from this
// render; the body maps each (value, lane) in Item to the corresponding
// (value, lane) of operand `Op`, looking through shuffles, into NItem —
// confirm the exact signature against upstream.
3466 for (InstLane IL : Item) {
3467 auto [U, Lane] = IL;
// Null entries stay poison; otherwise chase operand Op through shuffles.
3468 InstLane OpLane =
3469 U ? lookThroughShuffles(cast<Instruction>(U)->getOperand(Op), Lane)
3470 : InstLane{nullptr, PoisonMaskElem};
3471 NItem.emplace_back(OpLane);
3472 }
3473 return NItem;
3474 }
3475
3476/// Detect concat of multiple values into a vector
// (First line(s) of the signature are elided in this render; returns true if
// Item is a concatenation of NumSlices whole vectors and the concat shuffle
// is free per TTI.)
3478 const TargetTransformInfo &TTI) {
3479 auto *Ty = cast<FixedVectorType>(Item.front().first->getType());
3480 unsigned NumElts = Ty->getNumElements();
// Must be a strict widening: more lanes than one source, an exact multiple.
3481 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
3482 return false;
3483
3484 // Check that the concat is free, usually meaning that the type will be split
3485 // during legalization.
3486 SmallVector<int, 16> ConcatMask(NumElts * 2);
3487 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
3488 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
3489 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
3490 Ty, ConcatMask, CostKind) != 0)
3491 return false;
3492
3493 unsigned NumSlices = Item.size() / NumElts;
3494 // Currently we generate a tree of shuffles for the concats, which limits us
3495 // to a power2.
3496 if (!isPowerOf2_32(NumSlices))
3497 return false;
// Each slice must be one whole source vector read in identity lane order.
3498 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
3499 Value *SliceV = Item[Slice * NumElts].first;
3500 if (!SliceV || SliceV->getType() != Ty)
3501 return false;
3502 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
3503 auto [V, Lane] = Item[Slice * NumElts + Elt];
3504 if (Lane != static_cast<int>(Elt) || SliceV != V)
3505 return false;
3506 }
3507 }
3508 return true;
3509 }
3510
/// Rebuild the instruction tree described by \p Item without the superfluous
/// shuffles: leaves previously classified as identity/splat/concat are
/// materialized directly, and interior nodes are recreated as full-width
/// vector operations from recursively generated operands.
static Value *
                    const DenseSet<std::pair<Value *, Use *>> &IdentityLeafs,
                    const DenseSet<std::pair<Value *, Use *>> &SplatLeafs,
                    const DenseSet<std::pair<Value *, Use *>> &ConcatLeafs,
                    IRBuilderBase &Builder, const TargetTransformInfo *TTI) {
  auto [FrontV, FrontLane] = Item.front();

  // An identity leaf is already laid out exactly as required.
  if (IdentityLeafs.contains(std::make_pair(FrontV, From))) {
    return FrontV;
  }
  // A splat leaf broadcasts the front lane across the whole result width.
  if (SplatLeafs.contains(std::make_pair(FrontV, From))) {
    SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
    return Builder.CreateShuffleVector(FrontV, Mask);
  }
  // A concat leaf is built as a tree of pairwise concatenating shuffles,
  // doubling the element count at each level (the slice count is a power of
  // two, checked by isFreeConcat).
  if (ConcatLeafs.contains(std::make_pair(FrontV, From))) {
    unsigned NumElts =
        cast<FixedVectorType>(FrontV->getType())->getNumElements();
    SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
    for (unsigned S = 0; S < Values.size(); ++S)
      Values[S] = Item[S * NumElts].first;

    while (Values.size() > 1) {
      NumElts *= 2;
      SmallVector<int, 16> Mask(NumElts, 0);
      std::iota(Mask.begin(), Mask.end(), 0);
      SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
      for (unsigned S = 0; S < NewValues.size(); ++S)
        NewValues[S] =
            Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
      Values = NewValues;
    }
    return Values[0];
  }

  // Interior node: recreate the operation. For intrinsics, the trailing
  // callee operand is skipped and scalar arguments are passed through as-is.
  auto *I = cast<Instruction>(FrontV);
  auto *II = dyn_cast<IntrinsicInst>(I);
  unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
  for (unsigned Idx = 0; Idx < NumOps; Idx++) {
    if (II &&
        isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
      Ops[Idx] = II->getOperand(Idx);
      continue;
    }
                                   &I->getOperandUse(Idx), Ty, IdentityLeafs,
                                   SplatLeafs, ConcatLeafs, Builder, TTI);
  }

  // Collect the per-lane source values so their IR flags can be intersected
  // onto the newly created instruction via propagateIRFlags.
  SmallVector<Value *, 8> ValueList;
  for (const auto &Lane : Item)
    if (Lane.first)
      ValueList.push_back(Lane.first);

  Type *DstTy =
      FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
  if (auto *BI = dyn_cast<BinaryOperator>(I)) {
    auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
                                      Ops[0], Ops[1]);
    propagateIRFlags(Value, ValueList);
    return Value;
  }
  if (auto *CI = dyn_cast<CmpInst>(I)) {
    auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
    propagateIRFlags(Value, ValueList);
    return Value;
  }
  if (auto *SI = dyn_cast<SelectInst>(I)) {
    auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
    propagateIRFlags(Value, ValueList);
    return Value;
  }
  if (auto *CI = dyn_cast<CastInst>(I)) {
    auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
    propagateIRFlags(Value, ValueList);
    return Value;
  }
  if (II) {
    auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
    propagateIRFlags(Value, ValueList);
    return Value;
  }
  assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
  auto *Value =
      Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
  propagateIRFlags(Value, ValueList);
  return Value;
}
3600
// Starting from a shuffle, look up through operands tracking the shuffled index
// of each lane. If we can simplify away the shuffles to identities then
// do so.
bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
  auto *Ty = dyn_cast<FixedVectorType>(I.getType());
  if (!Ty || I.use_empty())
    return false;

  // Resolve, for every result lane, the (value, lane) it ultimately reads.
  SmallVector<InstLane> Start(Ty->getNumElements());
  for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
    Start[M] = lookThroughShuffles(&I, M);

  // Breadth-first walk over (lane-vector, use) work items: each item is
  // either classified as an identity/splat/concat leaf or, when every lane is
  // the same kind of single-use instruction, expanded into per-operand items.
  Worklist.push_back(std::make_pair(Start, &*I.use_begin()));
  DenseSet<std::pair<Value *, Use *>> IdentityLeafs, SplatLeafs, ConcatLeafs;
  unsigned NumVisited = 0;

  while (!Worklist.empty()) {
    if (++NumVisited > MaxInstrsToScan)
      return false;

    auto ItemFrom = Worklist.pop_back_val();
    auto Item = ItemFrom.first;
    auto From = ItemFrom.second;
    auto [FrontV, FrontLane] = Item.front();

    // If we found an undef first lane then bail out to keep things simple.
    if (!FrontV)
      return false;

    // Helper to peek through bitcasts to the same value.
    auto IsEquiv = [&](Value *X, Value *Y) {
      return X->getType() == Y->getType() &&
    };

    // Look for an identity value.
    if (FrontLane == 0 &&
        cast<FixedVectorType>(FrontV->getType())->getNumElements() ==
            Ty->getNumElements() &&
        all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
          Value *FrontV = Item.front().first;
          return !E.value().first || (IsEquiv(E.value().first, FrontV) &&
                                      E.value().second == (int)E.index());
        })) {
      IdentityLeafs.insert(std::make_pair(FrontV, From));
      continue;
    }
    // Look for constants, for the moment only supporting constant splats.
    if (auto *C = dyn_cast<Constant>(FrontV);
        C && C->getSplatValue() &&
        all_of(drop_begin(Item), [Item](InstLane &IL) {
          Value *FrontV = Item.front().first;
          Value *V = IL.first;
          return !V || (isa<Constant>(V) &&
                        cast<Constant>(V)->getSplatValue() ==
                            cast<Constant>(FrontV)->getSplatValue());
        })) {
      SplatLeafs.insert(std::make_pair(FrontV, From));
      continue;
    }
    // Look for a splat value.
    if (all_of(drop_begin(Item), [Item](InstLane &IL) {
          auto [FrontV, FrontLane] = Item.front();
          auto [V, Lane] = IL;
          return !V || (V == FrontV && Lane == FrontLane);
        })) {
      SplatLeafs.insert(std::make_pair(FrontV, From));
      continue;
    }

    // We need each element to be the same type of value, and check that each
    // element has a single use.
    auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
      Value *FrontV = Item.front().first;
      if (!IL.first)
        return true;
      Value *V = IL.first;
      if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
        return false;
      if (V->getValueID() != FrontV->getValueID())
        return false;
      // Compares must use the same predicate; casts the same source scalar
      // type; selects the same (vector) condition type.
      if (auto *CI = dyn_cast<CmpInst>(V))
        if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
          return false;
      if (auto *CI = dyn_cast<CastInst>(V))
        if (CI->getSrcTy()->getScalarType() !=
            cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
          return false;
      if (auto *SI = dyn_cast<SelectInst>(V))
        if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
            SI->getOperand(0)->getType() !=
                cast<SelectInst>(FrontV)->getOperand(0)->getType())
          return false;
      if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
        return false;
      auto *II = dyn_cast<IntrinsicInst>(V);
      return !II || (isa<IntrinsicInst>(FrontV) &&
                     II->getIntrinsicID() ==
                         cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
                     !II->hasOperandBundles());
    };
    if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
      // Check the operator is one that we support.
      if (isa<BinaryOperator, CmpInst>(FrontV)) {
        // We exclude div/rem in case they hit UB from poison lanes.
        if (auto *BO = dyn_cast<BinaryOperator>(FrontV);
            BO && BO->isIntDivRem())
          return false;
            &cast<Instruction>(FrontV)->getOperandUse(0));
            &cast<Instruction>(FrontV)->getOperandUse(1));
        continue;
      } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
                     FPToUIInst, SIToFPInst, UIToFPInst>(FrontV)) {
            &cast<Instruction>(FrontV)->getOperandUse(0));
        continue;
      } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontV)) {
        // TODO: Handle vector widening/narrowing bitcasts.
        auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
        auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
        if (DstTy && SrcTy &&
            SrcTy->getNumElements() == DstTy->getNumElements()) {
              &BitCast->getOperandUse(0));
          continue;
        }
      } else if (auto *Sel = dyn_cast<SelectInst>(FrontV)) {
            &Sel->getOperandUse(0));
            &Sel->getOperandUse(1));
            &Sel->getOperandUse(2));
        continue;
      } else if (auto *II = dyn_cast<IntrinsicInst>(FrontV);
                 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
                 !II->hasOperandBundles()) {
        for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
          // Scalar operands are not per-lane; they must be identical across
          // all lanes and are not descended into.
          if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
                                                 &TTI)) {
            if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
                  Value *FrontV = Item.front().first;
                  Value *V = IL.first;
                  return !V || (cast<Instruction>(V)->getOperand(Op) ==
                                cast<Instruction>(FrontV)->getOperand(Op));
                }))
              return false;
            continue;
          }
              &cast<Instruction>(FrontV)->getOperandUse(Op));
        }
        continue;
      }
    }

    if (isFreeConcat(Item, CostKind, TTI)) {
      ConcatLeafs.insert(std::make_pair(FrontV, From));
      continue;
    }

    return false;
  }

  // Visiting only the root means there was no shuffle chain to remove.
  if (NumVisited <= 1)
    return false;

  LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");

  // If we got this far, we know the shuffles are superfluous and can be
  // removed. Scan through again and generate the new tree of instructions.
  Builder.SetInsertPoint(&I);
  Value *V = generateNewInstTree(Start, &*I.use_begin(), Ty, IdentityLeafs,
                                 SplatLeafs, ConcatLeafs, Builder, &TTI);
  replaceValue(I, *V);
  return true;
}
3781
/// Given a commutative reduction, the order of the input lanes does not alter
/// the results. We can use this to remove certain shuffles feeding the
/// reduction, removing the need to shuffle at all.
bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
  auto *II = dyn_cast<IntrinsicInst>(&I);
  if (!II)
    return false;
  // Only commutative integer reductions are handled.
  switch (II->getIntrinsicID()) {
  case Intrinsic::vector_reduce_add:
  case Intrinsic::vector_reduce_mul:
  case Intrinsic::vector_reduce_and:
  case Intrinsic::vector_reduce_or:
  case Intrinsic::vector_reduce_xor:
  case Intrinsic::vector_reduce_smin:
  case Intrinsic::vector_reduce_smax:
  case Intrinsic::vector_reduce_umin:
  case Intrinsic::vector_reduce_umax:
    break;
  default:
    return false;
  }

  // Find all the inputs when looking through operations that do not alter the
  // lane order (binops, for example). Currently we look for a single shuffle,
  // and can ignore splat values.
  std::queue<Value *> Worklist;
  SmallPtrSet<Value *, 4> Visited;
  ShuffleVectorInst *Shuffle = nullptr;
  if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
    Worklist.push(Op);

  while (!Worklist.empty()) {
    Value *CV = Worklist.front();
    Worklist.pop();
    if (Visited.contains(CV))
      continue;

    // Splats don't change the order, so can be safely ignored.
    if (isSplatValue(CV))
      continue;

    Visited.insert(CV);

    if (auto *CI = dyn_cast<Instruction>(CV)) {
      if (CI->isBinaryOp()) {
        for (auto *Op : CI->operand_values())
          Worklist.push(Op);
        continue;
      } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
        // Exactly one shuffle may feed the reduction tree.
        if (Shuffle && Shuffle != SV)
          return false;
        Shuffle = SV;
        continue;
      }
    }

    // Anything else is currently an unknown node.
    return false;
  }

  if (!Shuffle)
    return false;

  // Check all uses of the binary ops and shuffles are also included in the
  // lane-invariant operations (Visited should be the list of lanewise
  // instructions, including the shuffle that we found).
  for (auto *V : Visited)
    for (auto *U : V->users())
      if (!Visited.contains(U) && U != &I)
        return false;

  FixedVectorType *VecType =
      dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
  if (!VecType)
    return false;
  FixedVectorType *ShuffleInputType =
  if (!ShuffleInputType)
    return false;
  unsigned NumInputElts = ShuffleInputType->getNumElements();

  // Find the mask from sorting the lanes into order. This is most likely to
  // become an identity or concat mask. Undef elements are pushed to the end.
  // (The unsigned compare sends -1 sentinels to the back of the mask.)
  SmallVector<int> ConcatMask;
  Shuffle->getShuffleMask(ConcatMask);
  sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
  bool UsesSecondVec =
      any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });

      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
      ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
      ShuffleInputType, ConcatMask, CostKind);

  LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
                    << "\n");
  LLVM_DEBUG(dbgs() << "  OldCost: " << OldCost << " vs NewCost: " << NewCost
                    << "\n");
  bool MadeChanges = false;
  if (NewCost < OldCost) {
    Builder.SetInsertPoint(Shuffle);
    Value *NewShuffle = Builder.CreateShuffleVector(
        Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
    LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
    replaceValue(*Shuffle, *NewShuffle);
    return true;
  }

  // See if we can re-use foldSelectShuffle, getting it to reduce the size of
  // the shuffle into a nicer order, as it can ignore the order of the shuffles.
  MadeChanges |= foldSelectShuffle(*Shuffle, true);
  return MadeChanges;
}
3897
/// For a given chain of patterns of the following form:
///
/// ```
/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
///
/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
/// ty1> %1)
/// OR
/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
///
/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
/// ...
/// ...
/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
/// 3), <n x ty1> %(i - 2)
/// OR
/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
///
/// %(i) = extractelement <n x ty1> %(i - 1), 0
/// ```
///
/// Where:
/// `mask` follows a partition pattern:
///
/// Ex:
/// [n = 8, p = poison]
///
/// 4 5 6 7 | p p p p
/// 2 3 | p p p p p p
/// 1 | p p p p p p p
///
/// For powers of 2, there's a consistent pattern, but for other cases
/// the parity of the current half value at each step decides the
/// next partition half (see `ExpectedParityMask` for more logical details
/// in generalising this).
///
/// Ex:
/// [n = 6]
///
/// 3 4 5 | p p p
/// 1 2 | p p p p
/// 1 | p p p p p
bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
  // Going bottom-up for the pattern.
  std::queue<Value *> InstWorklist;
  InstructionCost OrigCost = 0;

  // Common instruction operation after each shuffle op.
  std::optional<unsigned int> CommonCallOp = std::nullopt;
  std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;

  bool IsFirstCallOrBinInst = true;
  bool ShouldBeCallOrBinInst = true;

  // This stores the last used instructions for shuffle/common op.
  //
  // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
  // instructions from either shuffle/common op.
  SmallVector<Value *, 2> PrevVecV(2, nullptr);

  // The chain must be rooted at an extract of lane 0 of a fixed vector.
  Value *VecOpEE;
  if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
    return false;

  auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
  if (!FVT)
    return false;

  int64_t VecSize = FVT->getNumElements();
  if (VecSize < 2)
    return false;

  // Number of levels would be ~log2(n), considering we always partition
  // by half for this fold pattern.
  unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
  int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;

  // This is how we generalise for all element sizes.
  // At each step, if vector size is odd, we need non-poison
  // values to cover the dominant half so we don't miss out on any element.
  //
  // This mask will help us retrieve this as we go from bottom to top:
  //
  // Mask Set -> N = N * 2 - 1
  // Mask Unset -> N = N * 2
  for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
       Cur = (Cur + 1) / 2, --Mask) {
    if (Cur & 1)
      ExpectedParityMask |= (1ll << Mask);
  }

  InstWorklist.push(VecOpEE);

  // Walk the chain bottom-up, alternating between the common call/bin op and
  // a half-splitting shuffle, accumulating the cost of everything that the
  // fold would remove.
  while (!InstWorklist.empty()) {
    Value *CI = InstWorklist.front();
    InstWorklist.pop();

    if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
      if (!ShouldBeCallOrBinInst)
        return false;

      if (!IsFirstCallOrBinInst && any_of(PrevVecV, equal_to(nullptr)))
        return false;

      // For the first found call/bin op, the vector has to come from the
      // extract element op.
      if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
        return false;
      IsFirstCallOrBinInst = false;

      // All calls in the chain must use the same min/max intrinsic.
      if (!CommonCallOp)
        CommonCallOp = II->getIntrinsicID();
      if (II->getIntrinsicID() != *CommonCallOp)
        return false;

      switch (II->getIntrinsicID()) {
      case Intrinsic::umin:
      case Intrinsic::umax:
      case Intrinsic::smin:
      case Intrinsic::smax: {
        auto *Op0 = II->getOperand(0);
        auto *Op1 = II->getOperand(1);
        PrevVecV[0] = Op0;
        PrevVecV[1] = Op1;
        break;
      }
      default:
        return false;
      }
      ShouldBeCallOrBinInst ^= 1;

      IntrinsicCostAttributes ICA(
          *CommonCallOp, II->getType(),
          {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
      OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);

      // We may need a swap here since it can be (a, b) or (b, a)
      // and accordingly change as we go up.
      if (!isa<ShuffleVectorInst>(PrevVecV[1]))
        std::swap(PrevVecV[0], PrevVecV[1]);
      InstWorklist.push(PrevVecV[1]);
      InstWorklist.push(PrevVecV[0]);
    } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
      // Similar logic for bin ops.

      if (!ShouldBeCallOrBinInst)
        return false;

      if (!IsFirstCallOrBinInst && any_of(PrevVecV, equal_to(nullptr)))
        return false;

      if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
        return false;
      IsFirstCallOrBinInst = false;

      if (!CommonBinOp)
        CommonBinOp = BinOp->getOpcode();

      if (BinOp->getOpcode() != *CommonBinOp)
        return false;

      // Only commutative/associative binops are valid reduction ops here.
      switch (*CommonBinOp) {
      case BinaryOperator::Add:
      case BinaryOperator::Mul:
      case BinaryOperator::Or:
      case BinaryOperator::And:
      case BinaryOperator::Xor: {
        auto *Op0 = BinOp->getOperand(0);
        auto *Op1 = BinOp->getOperand(1);
        PrevVecV[0] = Op0;
        PrevVecV[1] = Op1;
        break;
      }
      default:
        return false;
      }
      ShouldBeCallOrBinInst ^= 1;

      OrigCost +=
          TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);

      if (!isa<ShuffleVectorInst>(PrevVecV[1]))
        std::swap(PrevVecV[0], PrevVecV[1]);
      InstWorklist.push(PrevVecV[1]);
      InstWorklist.push(PrevVecV[0]);
    } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
      // We shouldn't have any null values in the previous vectors,
      // if so, there was a mismatch in pattern.
      if (ShouldBeCallOrBinInst || any_of(PrevVecV, equal_to(nullptr)))
        return false;

      if (SVInst != PrevVecV[1])
        return false;

      // The shuffle must permute the other op's value with a poison second
      // operand.
      ArrayRef<int> CurMask;
      if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
                                   m_Mask(CurMask))))
        return false;

      // Subtract the parity mask when checking the condition.
      for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
        if (Mask < ShuffleMaskHalf &&
            CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
          return false;
        if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
          return false;
      }

      // Update mask values.
      ShuffleMaskHalf *= 2;
      ShuffleMaskHalf -= (ExpectedParityMask & 1);
      ExpectedParityMask >>= 1;

                                     SVInst->getType(), SVInst->getType(),
                                     CurMask, CostKind);

      VisitedCnt += 1;
      // Full pattern matched once every level has been consumed.
      if (!ExpectedParityMask && VisitedCnt == NumLevels)
        break;

      ShouldBeCallOrBinInst ^= 1;
    } else {
      return false;
    }
  }

  // Pattern should end with a shuffle op.
  if (ShouldBeCallOrBinInst)
    return false;

  assert(VecSize != -1 && "Expected Match for Vector Size");

  Value *FinalVecV = PrevVecV[0];
  if (!FinalVecV)
    return false;

  auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());

  // Map the common op onto the corresponding vector-reduce intrinsic.
  Intrinsic::ID ReducedOp =
      (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
                    : getReductionForBinop(*CommonBinOp));
  if (!ReducedOp)
    return false;

  IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});

  if (NewCost >= OrigCost)
    return false;

  auto *ReducedResult =
      Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
  replaceValue(I, *ReducedResult);

  return true;
}
4155
/// Determine if it's more efficient to fold:
/// reduce(trunc(x)) -> trunc(reduce(x)).
/// reduce(sext(x)) -> sext(reduce(x)).
/// reduce(zext(x)) -> zext(reduce(x)).
bool VectorCombine::foldCastFromReductions(Instruction &I) {
  auto *II = dyn_cast<IntrinsicInst>(&I);
  if (!II)
    return false;

  // add/mul reductions are only folded through trunc; the bitwise reductions
  // below additionally allow zext/sext sources.
  bool TruncOnly = false;
  Intrinsic::ID IID = II->getIntrinsicID();
  switch (IID) {
  case Intrinsic::vector_reduce_add:
  case Intrinsic::vector_reduce_mul:
    TruncOnly = true;
    break;
  case Intrinsic::vector_reduce_and:
  case Intrinsic::vector_reduce_or:
  case Intrinsic::vector_reduce_xor:
    break;
  default:
    return false;
  }

  unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
  Value *ReductionSrc = I.getOperand(0);

  // The cast must be single-use so that removing it actually saves work.
  Value *Src;
  if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
      (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
    return false;

  auto CastOpc =
      (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();

  auto *SrcTy = cast<VectorType>(Src->getType());
  auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
  Type *ResultTy = I.getType();

  // Compare reduce(cast(x)) (vector cast + reduction) against
  // cast(reduce(x)) (reduction + scalar cast).
      ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
  OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
                                  cast<CastInst>(ReductionSrc));
  InstructionCost NewCost =
      TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
                                     CostKind) +
      TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),

  if (OldCost <= NewCost || !NewCost.isValid())
    return false;

  Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
                                                II->getIntrinsicID(), {Src});
  Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
  replaceValue(I, *NewCast);
  return true;
}
4215
4216/// Fold:
4217/// icmp pred (reduce.{add,or,and,umax,umin}(signbit_extract(x))), C
4218/// into:
4219/// icmp sgt/slt (reduce.{or,umax,and,umin}(x)), -1/0
4220///
4221/// Sign-bit reductions produce values with known semantics:
4222/// - reduce.{or,umax}: 0 if no element is negative, 1 if any is
4223/// - reduce.{and,umin}: 1 if all elements are negative, 0 if any isn't
4224/// - reduce.add: count of negative elements (0 to NumElts)
4225///
4226/// Both lshr and ashr are supported:
4227/// - lshr produces 0 or 1, so reduce.add range is [0, N]
4228/// - ashr produces 0 or -1, so reduce.add range is [-N, 0]
4229///
4230/// The fold generalizes to multiple source vectors combined with the same
4231/// operation as the reduction. For example:
4232/// reduce.or(or(shr A, shr B)) conceptually extends the vector
4233/// For reduce.add, this changes the count to M*N where M is the number of
4234/// source vectors.
4235///
4236/// We transform to a direct sign check on the original vector using
4237/// reduce.{or,umax} or reduce.{and,umin}.
4238///
4239/// In spirit, it's similar to foldSignBitCheck in InstCombine.
4240bool VectorCombine::foldSignBitReductionCmp(Instruction &I) {
4241 CmpPredicate Pred;
4242 IntrinsicInst *ReduceOp;
4243 const APInt *CmpVal;
4244 if (!match(&I,
4245 m_ICmp(Pred, m_OneUse(m_AnyIntrinsic(ReduceOp)), m_APInt(CmpVal))))
4246 return false;
4247
4248 Intrinsic::ID OrigIID = ReduceOp->getIntrinsicID();
4249 switch (OrigIID) {
4250 case Intrinsic::vector_reduce_or:
4251 case Intrinsic::vector_reduce_umax:
4252 case Intrinsic::vector_reduce_and:
4253 case Intrinsic::vector_reduce_umin:
4254 case Intrinsic::vector_reduce_add:
4255 break;
4256 default:
4257 return false;
4258 }
4259
4260 Value *ReductionSrc = ReduceOp->getArgOperand(0);
4261 auto *VecTy = dyn_cast<FixedVectorType>(ReductionSrc->getType());
4262 if (!VecTy)
4263 return false;
4264
4265 unsigned BitWidth = VecTy->getScalarSizeInBits();
4266 if (BitWidth == 1)
4267 return false;
4268
4269 unsigned NumElts = VecTy->getNumElements();
4270
4271 // Determine the expected tree opcode for multi-vector patterns.
4272 // The tree opcode must match the reduction's underlying operation.
4273 //
4274 // TODO: for pairs of equivalent operators, we should match both,
4275 // not only the most common.
4276 Instruction::BinaryOps TreeOpcode;
4277 switch (OrigIID) {
4278 case Intrinsic::vector_reduce_or:
4279 case Intrinsic::vector_reduce_umax:
4280 TreeOpcode = Instruction::Or;
4281 break;
4282 case Intrinsic::vector_reduce_and:
4283 case Intrinsic::vector_reduce_umin:
4284 TreeOpcode = Instruction::And;
4285 break;
4286 case Intrinsic::vector_reduce_add:
4287 TreeOpcode = Instruction::Add;
4288 break;
4289 default:
4290 llvm_unreachable("Unexpected intrinsic");
4291 }
4292
4293 // Collect sign-bit extraction leaves from an associative tree of TreeOpcode.
4294 // The tree conceptually extends the vector being reduced.
4295 SmallVector<Value *, 8> Worklist;
4296 SmallVector<Value *, 8> Sources; // Original vectors (X in shr X, BW-1)
4297 Worklist.push_back(ReductionSrc);
4298 std::optional<bool> IsAShr;
4299 constexpr unsigned MaxSources = 8;
4300
4301 // Calculate old cost: all shifts + tree ops + reduction
4302 InstructionCost OldCost = TTI.getInstructionCost(ReduceOp, CostKind);
4303
4304 while (!Worklist.empty() && Worklist.size() <= MaxSources &&
4305 Sources.size() <= MaxSources) {
4306 Value *V = Worklist.pop_back_val();
4307
4308 // Try to match sign-bit extraction: shr X, (bitwidth-1)
4309 Value *X;
4310 if (match(V, m_OneUse(m_Shr(m_Value(X), m_SpecificInt(BitWidth - 1))))) {
4311 auto *Shr = cast<Instruction>(V);
4312
4313 // All shifts must be the same type (all lshr or all ashr)
4314 bool ThisIsAShr = Shr->getOpcode() == Instruction::AShr;
4315 if (!IsAShr)
4316 IsAShr = ThisIsAShr;
4317 else if (*IsAShr != ThisIsAShr)
4318 return false;
4319
4320 Sources.push_back(X);
4321
4322 // As part of the fold, we remove all of the shifts, so we need to keep
4323 // track of their costs.
4324 OldCost += TTI.getInstructionCost(Shr, CostKind);
4325
4326 continue;
4327 }
4328
4329 // Try to extend through a tree node of the expected opcode
4330 Value *A, *B;
4331 if (!match(V, m_OneUse(m_BinOp(TreeOpcode, m_Value(A), m_Value(B)))))
4332 return false;
4333
4334 // We are potentially replacing these operations as well, so we add them
4335 // to the costs.
4337
4338 Worklist.push_back(A);
4339 Worklist.push_back(B);
4340 }
4341
4342 // Must have at least one source and not exceed limit
4343 if (Sources.empty() || Sources.size() > MaxSources ||
4344 Worklist.size() > MaxSources || !IsAShr)
4345 return false;
4346
4347 unsigned NumSources = Sources.size();
4348
4349 // For reduce.add, the total count must fit as a signed integer.
4350 // Range is [0, M*N] for lshr or [-M*N, 0] for ashr.
4351 if (OrigIID == Intrinsic::vector_reduce_add &&
4352 !isIntN(BitWidth, NumSources * NumElts))
4353 return false;
4354
4355 // Compute the boundary value when all elements are negative:
4356 // - Per-element contribution: 1 for lshr, -1 for ashr
4357 // - For add: M*N (total elements across all sources); for others: just 1
4358 unsigned Count =
4359 (OrigIID == Intrinsic::vector_reduce_add) ? NumSources * NumElts : 1;
4360 APInt NegativeVal(CmpVal->getBitWidth(), Count);
4361 if (*IsAShr)
4362 NegativeVal.negate();
4363
4364 // Range is [min(0, AllNegVal), max(0, AllNegVal)]
4365 APInt Zero = APInt::getZero(CmpVal->getBitWidth());
4366 APInt RangeLow = APIntOps::smin(Zero, NegativeVal);
4367 APInt RangeHigh = APIntOps::smax(Zero, NegativeVal);
4368
4369 // Determine comparison semantics:
4370 // - IsEq: true for equality test, false for inequality
4371 // - TestsNegative: true if testing against AllNegVal, false for zero
4372 //
4373 // In addition to EQ/NE against 0 or AllNegVal, we support inequalities
4374 // that fold to boundary tests given the narrow value range:
4375 // < RangeHigh -> != RangeHigh
4376 // > RangeHigh-1 -> == RangeHigh
4377 // > RangeLow -> != RangeLow
4378 // < RangeLow+1 -> == RangeLow
4379 //
4380 // For inequalities, we work with signed predicates only. Unsigned predicates
4381 // are canonicalized to signed when the range is non-negative (where they are
4382 // equivalent). When the range includes negative values, unsigned predicates
4383 // would have different semantics due to wrap-around, so we reject them.
4384 if (!ICmpInst::isEquality(Pred) && !ICmpInst::isSigned(Pred)) {
4385 if (RangeLow.isNegative())
4386 return false;
4387 Pred = ICmpInst::getSignedPredicate(Pred);
4388 }
4389
4390 bool IsEq;
4391 bool TestsNegative;
4392 if (ICmpInst::isEquality(Pred)) {
4393 if (CmpVal->isZero()) {
4394 TestsNegative = false;
4395 } else if (*CmpVal == NegativeVal) {
4396 TestsNegative = true;
4397 } else {
4398 return false;
4399 }
4400 IsEq = Pred == ICmpInst::ICMP_EQ;
4401 } else if (Pred == ICmpInst::ICMP_SLT && *CmpVal == RangeHigh) {
4402 IsEq = false;
4403 TestsNegative = (RangeHigh == NegativeVal);
4404 } else if (Pred == ICmpInst::ICMP_SGT && *CmpVal == RangeHigh - 1) {
4405 IsEq = true;
4406 TestsNegative = (RangeHigh == NegativeVal);
4407 } else if (Pred == ICmpInst::ICMP_SGT && *CmpVal == RangeLow) {
4408 IsEq = false;
4409 TestsNegative = (RangeLow == NegativeVal);
4410 } else if (Pred == ICmpInst::ICMP_SLT && *CmpVal == RangeLow + 1) {
4411 IsEq = true;
4412 TestsNegative = (RangeLow == NegativeVal);
4413 } else {
4414 return false;
4415 }
4416
4417 // For this fold we support four types of checks:
4418 //
4419 // 1. All lanes are negative - AllNeg
4420 // 2. All lanes are non-negative - AllNonNeg
4421 // 3. At least one negative lane - AnyNeg
4422 // 4. At least one non-negative lane - AnyNonNeg
4423 //
4424 // For each case, we can generate the following code:
4425 //
4426 // 1. AllNeg - reduce.and/umin(X) < 0
4427 // 2. AllNonNeg - reduce.or/umax(X) > -1
4428 // 3. AnyNeg - reduce.or/umax(X) < 0
4429 // 4. AnyNonNeg - reduce.and/umin(X) > -1
4430 //
4431 // The table below shows the aggregation of all supported cases
4432 // using these four cases.
4433 //
4434 // Reduction | == 0 | != 0 | == MAX | != MAX
4435 // ------------+-----------+-----------+-----------+-----------
4436 // or/umax | AllNonNeg | AnyNeg | AnyNeg | AllNonNeg
4437 // and/umin | AnyNonNeg | AllNeg | AllNeg | AnyNonNeg
4438 // add | AllNonNeg | AnyNeg | AllNeg | AnyNonNeg
4439 //
4440 // NOTE: MAX = 1 for or/and/umax/umin, and the vector size N for add
4441 //
4442 // For easier codegen and check inversion, we use the following encoding:
4443 //
4444 // 1. Bit-3 === requires or/umax (1) or and/umin (0) check
4445 // 2. Bit-2 === checks < 0 (1) or > -1 (0)
4446 // 3. Bit-1 === universal (1) or existential (0) check
4447 //
4448 // AnyNeg = 0b110: uses or/umax, checks negative, any-check
4449 // AllNonNeg = 0b101: uses or/umax, checks non-neg, all-check
4450 // AnyNonNeg = 0b000: uses and/umin, checks non-neg, any-check
4451 // AllNeg = 0b011: uses and/umin, checks negative, all-check
4452 //
4453 // XOR with 0b011 inverts the check (swaps all/any and neg/non-neg).
4454 //
4455 enum CheckKind : unsigned {
4456 AnyNonNeg = 0b000,
4457 AllNeg = 0b011,
4458 AllNonNeg = 0b101,
4459 AnyNeg = 0b110,
4460 };
4461 // Return true if we fold this check into or/umax and false for and/umin
4462 auto RequiresOr = [](CheckKind C) -> bool { return C & 0b100; };
4463 // Return true if we should check if result is negative and false otherwise
4464 auto IsNegativeCheck = [](CheckKind C) -> bool { return C & 0b010; };
4465 // Logically invert the check
4466 auto Invert = [](CheckKind C) { return CheckKind(C ^ 0b011); };
4467
4468 CheckKind Base;
4469 switch (OrigIID) {
4470 case Intrinsic::vector_reduce_or:
4471 case Intrinsic::vector_reduce_umax:
4472 Base = TestsNegative ? AnyNeg : AllNonNeg;
4473 break;
4474 case Intrinsic::vector_reduce_and:
4475 case Intrinsic::vector_reduce_umin:
4476 Base = TestsNegative ? AllNeg : AnyNonNeg;
4477 break;
4478 case Intrinsic::vector_reduce_add:
4479 Base = TestsNegative ? AllNeg : AllNonNeg;
4480 break;
4481 default:
4482 llvm_unreachable("Unexpected intrinsic");
4483 }
4484
4485 CheckKind Check = IsEq ? Base : Invert(Base);
4486
4487 auto PickCheaper = [&](Intrinsic::ID Arith, Intrinsic::ID MinMax) {
4488 InstructionCost ArithCost =
4490 VecTy, std::nullopt, CostKind);
4491 InstructionCost MinMaxCost =
4493 FastMathFlags(), CostKind);
4494 return ArithCost <= MinMaxCost ? std::make_pair(Arith, ArithCost)
4495 : std::make_pair(MinMax, MinMaxCost);
4496 };
4497
4498 // Choose output reduction based on encoding's MSB
4499 auto [NewIID, NewCost] = RequiresOr(Check)
4500 ? PickCheaper(Intrinsic::vector_reduce_or,
4501 Intrinsic::vector_reduce_umax)
4502 : PickCheaper(Intrinsic::vector_reduce_and,
4503 Intrinsic::vector_reduce_umin);
4504
4505 // Add cost of combining multiple sources with or/and
4506 if (NumSources > 1) {
4507 unsigned CombineOpc =
4508 RequiresOr(Check) ? Instruction::Or : Instruction::And;
4509 NewCost += TTI.getArithmeticInstrCost(CombineOpc, VecTy, CostKind) *
4510 (NumSources - 1);
4511 }
4512
4513 LLVM_DEBUG(dbgs() << "Found sign-bit reduction cmp: " << I << "\n OldCost: "
4514 << OldCost << " vs NewCost: " << NewCost << "\n");
4515
4516 if (NewCost > OldCost)
4517 return false;
4518
4519 // Generate the combined input and reduction
4520 Builder.SetInsertPoint(&I);
4521 Type *ScalarTy = VecTy->getScalarType();
4522
4523 Value *Input;
4524 if (NumSources == 1) {
4525 Input = Sources[0];
4526 } else {
4527 // Combine sources with or/and based on check type
4528 Input = RequiresOr(Check) ? Builder.CreateOr(Sources)
4529 : Builder.CreateAnd(Sources);
4530 }
4531
4532 Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, NewIID, {Input});
4533 Value *NewCmp = IsNegativeCheck(Check) ? Builder.CreateIsNeg(NewReduce)
4534 : Builder.CreateIsNotNeg(NewReduce);
4535 replaceValue(I, *NewCmp);
4536 return true;
4537}
4538
4539/// vector.reduce.OP f(X_i) == 0 -> vector.reduce.OP X_i == 0
4540///
4541/// We can prove it for cases when:
4542///
4543/// 1. OP X_i == 0 <=> \forall i \in [1, N] X_i == 0
4544/// 1'. OP X_i == 0 <=> \exists j \in [1, N] X_j == 0
4545/// 2. f(x) == 0 <=> x == 0
4546///
4547/// From 1 and 2 (or 1' and 2), we can infer that
4548///
4549/// OP f(X_i) == 0 <=> OP X_i == 0.
4550///
4551/// (1)
4552/// OP f(X_i) == 0 <=> \forall i \in [1, N] f(X_i) == 0
4553/// (2)
4554/// <=> \forall i \in [1, N] X_i == 0
4555/// (1)
4556/// <=> OP(X_i) == 0
4557///
4558/// For some of the OP's and f's, we need to have domain constraints on X
4559/// to ensure properties 1 (or 1') and 2.
4560bool VectorCombine::foldICmpEqZeroVectorReduce(Instruction &I) {
4561 CmpPredicate Pred;
4562 Value *Op;
4563 if (!match(&I, m_ICmp(Pred, m_Value(Op), m_Zero())) ||
4564 !ICmpInst::isEquality(Pred))
4565 return false;
4566
4567 auto *II = dyn_cast<IntrinsicInst>(Op);
4568 if (!II)
4569 return false;
4570
4571 switch (II->getIntrinsicID()) {
4572 case Intrinsic::vector_reduce_add:
4573 case Intrinsic::vector_reduce_or:
4574 case Intrinsic::vector_reduce_umin:
4575 case Intrinsic::vector_reduce_umax:
4576 case Intrinsic::vector_reduce_smin:
4577 case Intrinsic::vector_reduce_smax:
4578 break;
4579 default:
4580 return false;
4581 }
4582
4583 Value *InnerOp = II->getArgOperand(0);
4584
4585 // TODO: fixed vector type might be too restrictive
4586 if (!II->hasOneUse() || !isa<FixedVectorType>(InnerOp->getType()))
4587 return false;
4588
4589 Value *X = nullptr;
4590
4591 // Check for zero-preserving operations where f(x) = 0 <=> x = 0
4592 //
4593 // 1. f(x) = shl nuw x, y for arbitrary y
4594 // 2. f(x) = mul nuw x, c for defined c != 0
4595 // 3. f(x) = zext x
4596 // 4. f(x) = sext x
4597 // 5. f(x) = neg x
4598 //
4599 if (!(match(InnerOp, m_NUWShl(m_Value(X), m_Value())) || // Case 1
4600 match(InnerOp, m_NUWMul(m_Value(X), m_NonZeroInt())) || // Case 2
4601 match(InnerOp, m_ZExt(m_Value(X))) || // Case 3
4602 match(InnerOp, m_SExt(m_Value(X))) || // Case 4
4603 match(InnerOp, m_Neg(m_Value(X))) // Case 5
4604 ))
4605 return false;
4606
4607 SimplifyQuery S = SQ.getWithInstruction(&I);
4608 auto *XTy = cast<FixedVectorType>(X->getType());
4609
4610 // Check for domain constraints for all supported reductions.
4611 //
4612 // a. OR X_i - has property 1 for every X
4613 // b. UMAX X_i - has property 1 for every X
4614 // c. UMIN X_i - has property 1' for every X
4615 // d. SMAX X_i - has property 1 for X >= 0
4616 // e. SMIN X_i - has property 1' for X >= 0
4617 // f. ADD X_i - has property 1 for X >= 0 && ADD X_i doesn't sign wrap
4618 //
4619 // In order for the proof to work, we need 1 (or 1') to be true for both
4620 // OP f(X_i) and OP X_i and that's why below we check constraints twice.
4621 //
4622 // NOTE: ADD X_i holds property 1 for a mirror case as well, i.e. when
4623 // X <= 0 && ADD X_i doesn't sign wrap. However, due to the nature
4624 // of known bits, we can't reasonably hold knowledge of "either 0
4625 // or negative".
4626 switch (II->getIntrinsicID()) {
4627 case Intrinsic::vector_reduce_add: {
4628 // We need to check that both X_i and f(X_i) have enough leading
4629 // zeros to not overflow.
4630 KnownBits KnownX = computeKnownBits(X, S);
4631 KnownBits KnownFX = computeKnownBits(InnerOp, S);
4632 unsigned NumElems = XTy->getNumElements();
4633 // Adding N elements loses at most ceil(log2(N)) leading bits.
4634 unsigned LostBits = Log2_32_Ceil(NumElems);
4635 unsigned LeadingZerosX = KnownX.countMinLeadingZeros();
4636 unsigned LeadingZerosFX = KnownFX.countMinLeadingZeros();
4637 // Need at least one leading zero left after summation to ensure no overflow
4638 if (LeadingZerosX <= LostBits || LeadingZerosFX <= LostBits)
4639 return false;
4640
4641 // We are not checking whether X or f(X) are positive explicitly because
4642 // we implicitly checked for it when we checked if both cases have enough
4643 // leading zeros to not wrap addition.
4644 break;
4645 }
4646 case Intrinsic::vector_reduce_smin:
4647 case Intrinsic::vector_reduce_smax:
4648 // Check whether X >= 0 and f(X) >= 0
4649 if (!isKnownNonNegative(InnerOp, S) || !isKnownNonNegative(X, S))
4650 return false;
4651
4652 break;
4653 default:
4654 break;
4655 };
4656
4657 LLVM_DEBUG(dbgs() << "Found a reduction to 0 comparison with removable op: "
4658 << *II << "\n");
4659
4660 // For zext/sext, check if the transform is profitable using cost model.
4661 // For other operations (shl, mul, neg), we're removing an instruction
4662 // while keeping the same reduction type, so it's always profitable.
4663 if (isa<ZExtInst>(InnerOp) || isa<SExtInst>(InnerOp)) {
4664 auto *FXTy = cast<FixedVectorType>(InnerOp->getType());
4665 Intrinsic::ID IID = II->getIntrinsicID();
4666
4668 cast<CastInst>(InnerOp)->getOpcode(), FXTy, XTy,
4670
4671 InstructionCost OldReduceCost, NewReduceCost;
4672 switch (IID) {
4673 case Intrinsic::vector_reduce_add:
4674 case Intrinsic::vector_reduce_or:
4675 OldReduceCost = TTI.getArithmeticReductionCost(
4676 getArithmeticReductionInstruction(IID), FXTy, std::nullopt, CostKind);
4677 NewReduceCost = TTI.getArithmeticReductionCost(
4678 getArithmeticReductionInstruction(IID), XTy, std::nullopt, CostKind);
4679 break;
4680 case Intrinsic::vector_reduce_umin:
4681 case Intrinsic::vector_reduce_umax:
4682 case Intrinsic::vector_reduce_smin:
4683 case Intrinsic::vector_reduce_smax:
4684 OldReduceCost = TTI.getMinMaxReductionCost(
4685 getMinMaxReductionIntrinsicOp(IID), FXTy, FastMathFlags(), CostKind);
4686 NewReduceCost = TTI.getMinMaxReductionCost(
4687 getMinMaxReductionIntrinsicOp(IID), XTy, FastMathFlags(), CostKind);
4688 break;
4689 default:
4690 llvm_unreachable("Unexpected reduction");
4691 }
4692
4693 InstructionCost OldCost = OldReduceCost + ExtCost;
4694 InstructionCost NewCost =
4695 NewReduceCost + (InnerOp->hasOneUse() ? 0 : ExtCost);
4696
4697 LLVM_DEBUG(dbgs() << "Found a removable extension before reduction: "
4698 << *InnerOp << "\n OldCost: " << OldCost
4699 << " vs NewCost: " << NewCost << "\n");
4700
4701 // We consider transformation to still be potentially beneficial even
4702 // when the costs are the same because we might remove a use from f(X)
4703 // and unlock other optimizations. Equal costs would just mean that we
4704 // didn't make it worse in the worst case.
4705 if (NewCost > OldCost)
4706 return false;
4707 }
4708
4709 // Since we support zext and sext as f, we might change the scalar type
4710 // of the intrinsic.
4711 Type *Ty = XTy->getScalarType();
4712 Value *NewReduce = Builder.CreateIntrinsic(Ty, II->getIntrinsicID(), {X});
4713 Value *NewCmp =
4714 Builder.CreateICmp(Pred, NewReduce, ConstantInt::getNullValue(Ty));
4715 replaceValue(I, *NewCmp);
4716 return true;
4717}
4718
4719/// Fold comparisons of reduce.or/reduce.and with reduce.umax/reduce.umin
4720/// based on cost, preserving the comparison semantics.
4721///
4722/// We use two fundamental properties for each pair:
4723///
4724/// 1. or(X) == 0 <=> umax(X) == 0
4725/// 2. or(X) == 1 <=> umax(X) == 1
4726/// 3. sign(or(X)) == sign(umax(X))
4727///
4728/// 1. and(X) == -1 <=> umin(X) == -1
4729/// 2. and(X) == -2 <=> umin(X) == -2
4730/// 3. sign(and(X)) == sign(umin(X))
4731///
4732/// From these we can infer the following transformations:
4733/// a. or(X) ==/!= 0 <-> umax(X) ==/!= 0
4734/// b. or(X) s< 0 <-> umax(X) s< 0
4735/// c. or(X) s> -1 <-> umax(X) s> -1
4736/// d. or(X) s< 1 <-> umax(X) s< 1
4737/// e. or(X) ==/!= 1 <-> umax(X) ==/!= 1
4738/// f. or(X) s< 2 <-> umax(X) s< 2
4739/// g. and(X) ==/!= -1 <-> umin(X) ==/!= -1
4740/// h. and(X) s< 0 <-> umin(X) s< 0
4741/// i. and(X) s> -1 <-> umin(X) s> -1
4742/// j. and(X) s> -2 <-> umin(X) s> -2
4743/// k. and(X) ==/!= -2 <-> umin(X) ==/!= -2
4744/// l. and(X) s> -3 <-> umin(X) s> -3
4745///
4746bool VectorCombine::foldEquivalentReductionCmp(Instruction &I) {
4747 CmpPredicate Pred;
4748 Value *ReduceOp;
4749 const APInt *CmpVal;
4750 if (!match(&I, m_ICmp(Pred, m_Value(ReduceOp), m_APInt(CmpVal))))
4751 return false;
4752
4753 auto *II = dyn_cast<IntrinsicInst>(ReduceOp);
4754 if (!II || !II->hasOneUse())
4755 return false;
4756
4757 const auto IsValidOrUmaxCmp = [&]() {
4758 // or === umax for i1
4759 if (CmpVal->getBitWidth() == 1)
4760 return true;
4761
4762 // Cases a and e
4763 bool IsEquality =
4764 (CmpVal->isZero() || CmpVal->isOne()) && ICmpInst::isEquality(Pred);
4765 // Case c
4766 bool IsPositive = CmpVal->isAllOnes() && Pred == ICmpInst::ICMP_SGT;
4767 // Cases b, d, and f
4768 bool IsNegative = (CmpVal->isZero() || CmpVal->isOne() || *CmpVal == 2) &&
4769 Pred == ICmpInst::ICMP_SLT;
4770 return IsEquality || IsPositive || IsNegative;
4771 };
4772
4773 const auto IsValidAndUminCmp = [&]() {
4774 // and === umin for i1
4775 if (CmpVal->getBitWidth() == 1)
4776 return true;
4777
4778 const auto LeadingOnes = CmpVal->countl_one();
4779
4780 // Cases g and k
4781 bool IsEquality =
4782 (CmpVal->isAllOnes() || LeadingOnes + 1 == CmpVal->getBitWidth()) &&
4784 // Case h
4785 bool IsNegative = CmpVal->isZero() && Pred == ICmpInst::ICMP_SLT;
4786 // Cases i, j, and l
4787 bool IsPositive =
4788 // if the number has at least N - 2 leading ones
4789 // and the two LSBs are:
4790 // - 1 x 1 -> -1
4791 // - 1 x 0 -> -2
4792 // - 0 x 1 -> -3
4793 LeadingOnes + 2 >= CmpVal->getBitWidth() &&
4794 ((*CmpVal)[0] || (*CmpVal)[1]) && Pred == ICmpInst::ICMP_SGT;
4795 return IsEquality || IsNegative || IsPositive;
4796 };
4797
4798 Intrinsic::ID OriginalIID = II->getIntrinsicID();
4799 Intrinsic::ID AlternativeIID;
4800
4801 // Check if this is a valid comparison pattern and determine the alternate
4802 // reduction intrinsic.
4803 switch (OriginalIID) {
4804 case Intrinsic::vector_reduce_or:
4805 if (!IsValidOrUmaxCmp())
4806 return false;
4807 AlternativeIID = Intrinsic::vector_reduce_umax;
4808 break;
4809 case Intrinsic::vector_reduce_umax:
4810 if (!IsValidOrUmaxCmp())
4811 return false;
4812 AlternativeIID = Intrinsic::vector_reduce_or;
4813 break;
4814 case Intrinsic::vector_reduce_and:
4815 if (!IsValidAndUminCmp())
4816 return false;
4817 AlternativeIID = Intrinsic::vector_reduce_umin;
4818 break;
4819 case Intrinsic::vector_reduce_umin:
4820 if (!IsValidAndUminCmp())
4821 return false;
4822 AlternativeIID = Intrinsic::vector_reduce_and;
4823 break;
4824 default:
4825 return false;
4826 }
4827
4828 Value *X = II->getArgOperand(0);
4829 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
4830 if (!VecTy)
4831 return false;
4832
4833 const auto GetReductionCost = [&](Intrinsic::ID IID) -> InstructionCost {
4834 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4835 if (ReductionOpc != Instruction::ICmp)
4836 return TTI.getArithmeticReductionCost(ReductionOpc, VecTy, std::nullopt,
4837 CostKind);
4839 FastMathFlags(), CostKind);
4840 };
4841
4842 InstructionCost OrigCost = GetReductionCost(OriginalIID);
4843 InstructionCost AltCost = GetReductionCost(AlternativeIID);
4844
4845 LLVM_DEBUG(dbgs() << "Found equivalent reduction cmp: " << I
4846 << "\n OrigCost: " << OrigCost
4847 << " vs AltCost: " << AltCost << "\n");
4848
4849 if (AltCost >= OrigCost)
4850 return false;
4851
4852 Builder.SetInsertPoint(&I);
4853 Type *ScalarTy = VecTy->getScalarType();
4854 Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, AlternativeIID, {X});
4855 Value *NewCmp =
4856 Builder.CreateICmp(Pred, NewReduce, ConstantInt::get(ScalarTy, *CmpVal));
4857
4858 replaceValue(I, *NewCmp);
4859 return true;
4860}
4861
4862/// Returns true if this ShuffleVectorInst eventually feeds into a
4863/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
4864/// chains of shuffles and binary operators (in any combination/order).
4865/// The search does not go deeper than the given Depth.
4867 constexpr unsigned MaxVisited = 32;
4870 bool FoundReduction = false;
4871
4872 WorkList.push_back(SVI);
4873 while (!WorkList.empty()) {
4874 Instruction *I = WorkList.pop_back_val();
4875 for (User *U : I->users()) {
4876 auto *UI = cast<Instruction>(U);
4877 if (!UI || !Visited.insert(UI).second)
4878 continue;
4879 if (Visited.size() > MaxVisited)
4880 return false;
4881 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
4882 // More than one reduction reached
4883 if (FoundReduction)
4884 return false;
4885 switch (II->getIntrinsicID()) {
4886 case Intrinsic::vector_reduce_add:
4887 case Intrinsic::vector_reduce_mul:
4888 case Intrinsic::vector_reduce_and:
4889 case Intrinsic::vector_reduce_or:
4890 case Intrinsic::vector_reduce_xor:
4891 case Intrinsic::vector_reduce_smin:
4892 case Intrinsic::vector_reduce_smax:
4893 case Intrinsic::vector_reduce_umin:
4894 case Intrinsic::vector_reduce_umax:
4895 FoundReduction = true;
4896 continue;
4897 default:
4898 return false;
4899 }
4900 }
4901
4903 return false;
4904
4905 WorkList.emplace_back(UI);
4906 }
4907 }
4908 return FoundReduction;
4909}
4910
4911/// This method looks for groups of shuffles acting on binops, of the form:
4912/// %x = shuffle ...
4913/// %y = shuffle ...
4914/// %a = binop %x, %y
4915/// %b = binop %x, %y
4916/// shuffle %a, %b, selectmask
4917/// We may, especially if the shuffle is wider than legal, be able to convert
4918/// the shuffle to a form where only parts of a and b need to be computed. On
4919/// architectures with no obvious "select" shuffle, this can reduce the total
4920/// number of operations if the target reports them as cheaper.
4921bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
4922 auto *SVI = cast<ShuffleVectorInst>(&I);
4923 auto *VT = cast<FixedVectorType>(I.getType());
4924 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
4925 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
4926 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
4927 VT != Op0->getType())
4928 return false;
4929
4930 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
4931 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
4932 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
4933 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
4934 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
4935 auto checkSVNonOpUses = [&](Instruction *I) {
4936 if (!I || I->getOperand(0)->getType() != VT)
4937 return true;
4938 return any_of(I->users(), [&](User *U) {
4939 return U != Op0 && U != Op1 &&
4940 !(isa<ShuffleVectorInst>(U) &&
4941 (InputShuffles.contains(cast<Instruction>(U)) ||
4942 isInstructionTriviallyDead(cast<Instruction>(U))));
4943 });
4944 };
4945 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
4946 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
4947 return false;
4948
4949 // Collect all the uses that are shuffles that we can transform together. We
4950 // may not have a single shuffle, but a group that can all be transformed
4951 // together profitably.
4953 auto collectShuffles = [&](Instruction *I) {
4954 for (auto *U : I->users()) {
4955 auto *SV = dyn_cast<ShuffleVectorInst>(U);
4956 if (!SV || SV->getType() != VT)
4957 return false;
4958 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
4959 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
4960 return false;
4961 if (!llvm::is_contained(Shuffles, SV))
4962 Shuffles.push_back(SV);
4963 }
4964 return true;
4965 };
4966 if (!collectShuffles(Op0) || !collectShuffles(Op1))
4967 return false;
4968 // From a reduction, we need to be processing a single shuffle, otherwise the
4969 // other uses will not be lane-invariant.
4970 if (FromReduction && Shuffles.size() > 1)
4971 return false;
4972
4973 // Add any shuffle uses for the shuffles we have found, to include them in our
4974 // cost calculations.
4975 if (!FromReduction) {
4976 for (ShuffleVectorInst *SV : Shuffles) {
4977 for (auto *U : SV->users()) {
4978 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
4979 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
4980 Shuffles.push_back(SSV);
4981 }
4982 }
4983 }
4984
4985 // For each of the output shuffles, we try to sort all the first vector
4986 // elements to the beginning, followed by the second array elements at the
4987 // end. If the binops are legalized to smaller vectors, this may reduce total
4988 // number of binops. We compute the ReconstructMask mask needed to convert
4989 // back to the original lane order.
4991 SmallVector<SmallVector<int>> OrigReconstructMasks;
4992 int MaxV1Elt = 0, MaxV2Elt = 0;
4993 unsigned NumElts = VT->getNumElements();
4994 for (ShuffleVectorInst *SVN : Shuffles) {
4995 SmallVector<int> Mask;
4996 SVN->getShuffleMask(Mask);
4997
4998 // Check the operands are the same as the original, or reversed (in which
4999 // case we need to commute the mask).
5000 Value *SVOp0 = SVN->getOperand(0);
5001 Value *SVOp1 = SVN->getOperand(1);
5002 if (isa<UndefValue>(SVOp1)) {
5003 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
5004 SVOp0 = SSV->getOperand(0);
5005 SVOp1 = SSV->getOperand(1);
5006 for (int &Elem : Mask) {
5007 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
5008 return false;
5009 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
5010 }
5011 }
5012 if (SVOp0 == Op1 && SVOp1 == Op0) {
5013 std::swap(SVOp0, SVOp1);
5015 }
5016 if (SVOp0 != Op0 || SVOp1 != Op1)
5017 return false;
5018
5019 // Calculate the reconstruction mask for this shuffle, as the mask needed to
5020 // take the packed values from Op0/Op1 and reconstructing to the original
5021 // order.
5022 SmallVector<int> ReconstructMask;
5023 for (unsigned I = 0; I < Mask.size(); I++) {
5024 if (Mask[I] < 0) {
5025 ReconstructMask.push_back(-1);
5026 } else if (Mask[I] < static_cast<int>(NumElts)) {
5027 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
5028 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
5029 return Mask[I] == A.first;
5030 });
5031 if (It != V1.end())
5032 ReconstructMask.push_back(It - V1.begin());
5033 else {
5034 ReconstructMask.push_back(V1.size());
5035 V1.emplace_back(Mask[I], V1.size());
5036 }
5037 } else {
5038 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
5039 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
5040 return Mask[I] - static_cast<int>(NumElts) == A.first;
5041 });
5042 if (It != V2.end())
5043 ReconstructMask.push_back(NumElts + It - V2.begin());
5044 else {
5045 ReconstructMask.push_back(NumElts + V2.size());
5046 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
5047 }
5048 }
5049 }
5050
5051 // For reductions, we know that the lane ordering out doesn't alter the
5052 // result. In-order can help simplify the shuffle away.
5053 if (FromReduction)
5054 sort(ReconstructMask);
5055 OrigReconstructMasks.push_back(std::move(ReconstructMask));
5056 }
5057
5058 // If the Maximum element used from V1 and V2 are not larger than the new
5059 // vectors, the vectors are already packes and performing the optimization
5060 // again will likely not help any further. This also prevents us from getting
5061 // stuck in a cycle in case the costs do not also rule it out.
5062 if (V1.empty() || V2.empty() ||
5063 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
5064 MaxV2Elt == static_cast<int>(V2.size()) - 1))
5065 return false;
5066
5067 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
5068 // shuffle of another shuffle, or not a shuffle (that is treated like a
5069 // identity shuffle).
5070 auto GetBaseMaskValue = [&](Instruction *I, int M) {
5071 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5072 if (!SV)
5073 return M;
5074 if (isa<UndefValue>(SV->getOperand(1)))
5075 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
5076 if (InputShuffles.contains(SSV))
5077 return SSV->getMaskValue(SV->getMaskValue(M));
5078 return SV->getMaskValue(M);
5079 };
5080
5081 // Attempt to sort the inputs my ascending mask values to make simpler input
5082 // shuffles and push complex shuffles down to the uses. We sort on the first
5083 // of the two input shuffle orders, to try and get at least one input into a
5084 // nice order.
5085 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
5086 std::pair<int, int> Y) {
5087 int MXA = GetBaseMaskValue(A, X.first);
5088 int MYA = GetBaseMaskValue(A, Y.first);
5089 return MXA < MYA;
5090 };
5091 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
5092 return SortBase(SVI0A, A, B);
5093 });
5094 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
5095 return SortBase(SVI1A, A, B);
5096 });
5097 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
5098 // modified order of the input shuffles.
5099 SmallVector<SmallVector<int>> ReconstructMasks;
5100 for (const auto &Mask : OrigReconstructMasks) {
5101 SmallVector<int> ReconstructMask;
5102 for (int M : Mask) {
5103 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
5104 auto It = find_if(V, [M](auto A) { return A.second == M; });
5105 assert(It != V.end() && "Expected all entries in Mask");
5106 return std::distance(V.begin(), It);
5107 };
5108 if (M < 0)
5109 ReconstructMask.push_back(-1);
5110 else if (M < static_cast<int>(NumElts)) {
5111 ReconstructMask.push_back(FindIndex(V1, M));
5112 } else {
5113 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
5114 }
5115 }
5116 ReconstructMasks.push_back(std::move(ReconstructMask));
5117 }
5118
5119 // Calculate the masks needed for the new input shuffles, which get padded
5120 // with undef
5121 SmallVector<int> V1A, V1B, V2A, V2B;
5122 for (unsigned I = 0; I < V1.size(); I++) {
5123 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
5124 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
5125 }
5126 for (unsigned I = 0; I < V2.size(); I++) {
5127 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
5128 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
5129 }
5130 while (V1A.size() < NumElts) {
5133 }
5134 while (V2A.size() < NumElts) {
5137 }
5138
5139 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
5140 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5141 if (!SV)
5142 return C;
5143 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
5146 VT, VT, SV->getShuffleMask(), CostKind);
5147 };
5148 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
5149 return C +
5151 };
5152
5153 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
5154 unsigned MaxVectorSize =
5156 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
5157 if (MaxElementsInVector == 0)
5158 return false;
5159 // When there are multiple shufflevector operations on the same input,
5160 // especially when the vector length is larger than the register size,
5161 // identical shuffle patterns may occur across different groups of elements.
5162 // To avoid overestimating the cost by counting these repeated shuffles more
5163 // than once, we only account for unique shuffle patterns. This adjustment
5164 // prevents inflated costs in the cost model for wide vectors split into
5165 // several register-sized groups.
5166 std::set<SmallVector<int, 4>> UniqueShuffles;
5167 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
5168 // Compute the cost for performing the shuffle over the full vector.
5169 auto ShuffleCost =
5171 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
5172 if (NumFullVectors < 2)
5173 return C + ShuffleCost;
5174 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
5175 unsigned NumUniqueGroups = 0;
5176 unsigned NumGroups = Mask.size() / MaxElementsInVector;
5177 // For each group of MaxElementsInVector contiguous elements,
5178 // collect their shuffle pattern and insert into the set of unique patterns.
5179 for (unsigned I = 0; I < NumFullVectors; ++I) {
5180 for (unsigned J = 0; J < MaxElementsInVector; ++J)
5181 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
5182 if (UniqueShuffles.insert(SubShuffle).second)
5183 NumUniqueGroups += 1;
5184 }
5185 return C + ShuffleCost * NumUniqueGroups / NumGroups;
5186 };
5187 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
5188 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5189 if (!SV)
5190 return C;
5191 SmallVector<int, 16> Mask;
5192 SV->getShuffleMask(Mask);
5193 return AddShuffleMaskAdjustedCost(C, Mask);
5194 };
5195 // Check that input consists of ShuffleVectors applied to the same input
5196 auto AllShufflesHaveSameOperands =
5197 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
5198 if (InputShuffles.size() < 2)
5199 return false;
5200 ShuffleVectorInst *FirstSV =
5201 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
5202 if (!FirstSV)
5203 return false;
5204
5205 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
5206 return std::all_of(
5207 std::next(InputShuffles.begin()), InputShuffles.end(),
5208 [&](Instruction *I) {
5209 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
5210 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
5211 });
5212 };
5213
5214 // Get the costs of the shuffles + binops before and after with the new
5215 // shuffle masks.
5216 InstructionCost CostBefore =
5217 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
5218 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
5219 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
5220 InstructionCost(0), AddShuffleCost);
5221 if (AllShufflesHaveSameOperands(InputShuffles)) {
5222 UniqueShuffles.clear();
5223 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
5224 InstructionCost(0), AddShuffleAdjustedCost);
5225 } else {
5226 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
5227 InstructionCost(0), AddShuffleCost);
5228 }
5229
5230 // The new binops will be unused for lanes past the used shuffle lengths.
5231 // These types attempt to get the correct cost for that from the target.
5232 FixedVectorType *Op0SmallVT =
5233 FixedVectorType::get(VT->getScalarType(), V1.size());
5234 FixedVectorType *Op1SmallVT =
5235 FixedVectorType::get(VT->getScalarType(), V2.size());
5236 InstructionCost CostAfter =
5237 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
5238 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
5239 UniqueShuffles.clear();
5240 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
5241 InstructionCost(0), AddShuffleMaskAdjustedCost);
5242 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
5243 CostAfter +=
5244 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
5245 InstructionCost(0), AddShuffleMaskCost);
5246
5247 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
5248 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
5249 << " vs CostAfter: " << CostAfter << "\n");
5250 if (CostBefore < CostAfter ||
5251 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
5252 return false;
5253
5254 // The cost model has passed, create the new instructions.
5255 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
5256 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5257 if (!SV)
5258 return I;
5259 if (isa<UndefValue>(SV->getOperand(1)))
5260 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
5261 if (InputShuffles.contains(SSV))
5262 return SSV->getOperand(Op);
5263 return SV->getOperand(Op);
5264 };
5265 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
5266 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
5267 GetShuffleOperand(SVI0A, 1), V1A);
5268 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
5269 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
5270 GetShuffleOperand(SVI0B, 1), V1B);
5271 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
5272 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
5273 GetShuffleOperand(SVI1A, 1), V2A);
5274 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
5275 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
5276 GetShuffleOperand(SVI1B, 1), V2B);
5277 Builder.SetInsertPoint(Op0);
5278 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
5279 NSV0A, NSV0B);
5280 if (auto *I = dyn_cast<Instruction>(NOp0))
5281 I->copyIRFlags(Op0, true);
5282 Builder.SetInsertPoint(Op1);
5283 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
5284 NSV1A, NSV1B);
5285 if (auto *I = dyn_cast<Instruction>(NOp1))
5286 I->copyIRFlags(Op1, true);
5287
5288 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
5289 Builder.SetInsertPoint(Shuffles[S]);
5290 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
5291 replaceValue(*Shuffles[S], *NSV, false);
5292 }
5293
5294 Worklist.pushValue(NSV0A);
5295 Worklist.pushValue(NSV0B);
5296 Worklist.pushValue(NSV1A);
5297 Worklist.pushValue(NSV1B);
5298 return true;
5299}
5300
5301/// Check if instruction depends on ZExt and this ZExt can be moved after the
5302/// instruction. Move ZExt if it is profitable. For example:
5303/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
5304/// lshr((zext(x),y) -> zext(lshr(x,trunc(y)))
5305/// Cost model calculations takes into account if zext(x) has other users and
5306/// whether it can be propagated through them too.
5307bool VectorCombine::shrinkType(Instruction &I) {
5308 Value *ZExted, *OtherOperand;
5309 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
5310 m_Value(OtherOperand))) &&
5311 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
5312 return false;
5313
5314 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
5315
5316 auto *BigTy = cast<FixedVectorType>(I.getType());
5317 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
5318 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
5319
5320 if (I.getOpcode() == Instruction::LShr) {
5321 // Check that the shift amount is less than the number of bits in the
5322 // smaller type. Otherwise, the smaller lshr will return a poison value.
5323 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
5324 if (ShAmtKB.getMaxValue().uge(BW))
5325 return false;
5326 } else {
5327 // Check that the expression overall uses at most the same number of bits as
5328 // ZExted
5329 KnownBits KB = computeKnownBits(&I, *DL);
5330 if (KB.countMaxActiveBits() > BW)
5331 return false;
5332 }
5333
5334 // Calculate costs of leaving current IR as it is and moving ZExt operation
5335 // later, along with adding truncates if needed
5337 Instruction::ZExt, BigTy, SmallTy,
5338 TargetTransformInfo::CastContextHint::None, CostKind);
5339 InstructionCost CurrentCost = ZExtCost;
5340 InstructionCost ShrinkCost = 0;
5341
5342 // Calculate total cost and check that we can propagate through all ZExt users
5343 for (User *U : ZExtOperand->users()) {
5344 auto *UI = cast<Instruction>(U);
5345 if (UI == &I) {
5346 CurrentCost +=
5347 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
5348 ShrinkCost +=
5349 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
5350 ShrinkCost += ZExtCost;
5351 continue;
5352 }
5353
5354 if (!Instruction::isBinaryOp(UI->getOpcode()))
5355 return false;
5356
5357 // Check if we can propagate ZExt through its other users
5358 KnownBits KB = computeKnownBits(UI, *DL);
5359 if (KB.countMaxActiveBits() > BW)
5360 return false;
5361
5362 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
5363 ShrinkCost +=
5364 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
5365 ShrinkCost += ZExtCost;
5366 }
5367
5368 // If the other instruction operand is not a constant, we'll need to
5369 // generate a truncate instruction. So we have to adjust cost
5370 if (!isa<Constant>(OtherOperand))
5371 ShrinkCost += TTI.getCastInstrCost(
5372 Instruction::Trunc, SmallTy, BigTy,
5373 TargetTransformInfo::CastContextHint::None, CostKind);
5374
5375 // If the cost of shrinking types and leaving the IR is the same, we'll lean
5376 // towards modifying the IR because shrinking opens opportunities for other
5377 // shrinking optimisations.
5378 if (ShrinkCost > CurrentCost)
5379 return false;
5380
5381 Builder.SetInsertPoint(&I);
5382 Value *Op0 = ZExted;
5383 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
5384 // Keep the order of operands the same
5385 if (I.getOperand(0) == OtherOperand)
5386 std::swap(Op0, Op1);
5387 Value *NewBinOp =
5388 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
5389 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
5390 cast<Instruction>(NewBinOp)->copyMetadata(I);
5391 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
5392 replaceValue(I, *NewZExtr);
5393 return true;
5394}
5395
5396/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
5397/// shuffle (DstVec, SrcVec, Mask)
5398bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
5399 Value *DstVec, *SrcVec;
5400 uint64_t ExtIdx, InsIdx;
5401 if (!match(&I,
5402 m_InsertElt(m_Value(DstVec),
5403 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
5404 m_ConstantInt(InsIdx))))
5405 return false;
5406
5407 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
5408 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
5409 // We can try combining vectors with different element sizes.
5410 if (!DstVecTy || !SrcVecTy ||
5411 SrcVecTy->getElementType() != DstVecTy->getElementType())
5412 return false;
5413
5414 unsigned NumDstElts = DstVecTy->getNumElements();
5415 unsigned NumSrcElts = SrcVecTy->getNumElements();
5416 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
5417 return false;
5418
5419 // Insertion into poison is a cheaper single operand shuffle.
5421 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
5422
5423 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
5424 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
5425 if (NeedDstSrcSwap) {
5427 Mask[InsIdx] = ExtIdx % NumDstElts;
5428 std::swap(DstVec, SrcVec);
5429 } else {
5431 std::iota(Mask.begin(), Mask.end(), 0);
5432 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
5433 }
5434
5435 // Cost
5436 auto *Ins = cast<InsertElementInst>(&I);
5437 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
5438 InstructionCost InsCost =
5439 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
5440 InstructionCost ExtCost =
5441 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
5442 InstructionCost OldCost = ExtCost + InsCost;
5443
5444 InstructionCost NewCost = 0;
5445 SmallVector<int> ExtToVecMask;
5446 if (!NeedExpOrNarrow) {
5447 // Ignore 'free' identity insertion shuffle.
5448 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
5449 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
5450 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
5451 nullptr, {DstVec, SrcVec});
5452 } else {
5453 // When creating a length-changing-vector, always try to keep the relevant
5454 // element in an equivalent position, so that bulk shuffles are more likely
5455 // to be useful.
5456 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
5457 ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
5458 // Add cost for expanding or narrowing
5460 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
5461 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
5462 }
5463
5464 if (!Ext->hasOneUse())
5465 NewCost += ExtCost;
5466
5467 LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair: " << I
5468 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
5469 << "\n");
5470
5471 if (OldCost < NewCost)
5472 return false;
5473
5474 if (NeedExpOrNarrow) {
5475 if (!NeedDstSrcSwap)
5476 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
5477 else
5478 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
5479 }
5480
5481 // Canonicalize undef param to RHS to help further folds.
5482 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
5483 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
5484 std::swap(DstVec, SrcVec);
5485 }
5486
5487 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
5488 replaceValue(I, *Shuf);
5489
5490 return true;
5491}
5492
5493/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
5494/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
5495/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
5496/// before casting it back into `<vscale x 16 x i32>`.
5497bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
5498 const APInt *SplatVal0, *SplatVal1;
5500 m_APInt(SplatVal0), m_APInt(SplatVal1))))
5501 return false;
5502
5503 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
5504 << "\n");
5505
5506 auto *VTy =
5507 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
5508 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
5509 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
5510
5511 // Just in case the cost of interleave2 intrinsic and bitcast are both
5512 // invalid, in which case we want to bail out, we use <= rather
5513 // than < here. Even they both have valid and equal costs, it's probably
5514 // not a good idea to emit a high-cost constant splat.
5516 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
5518 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
5519 << *I.getType() << " is too high.\n");
5520 return false;
5521 }
5522
5523 APInt NewSplatVal = SplatVal1->zext(Width * 2);
5524 NewSplatVal <<= Width;
5525 NewSplatVal |= SplatVal0->zext(Width * 2);
5526 auto *NewSplat = ConstantVector::getSplat(
5527 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
5528
5529 IRBuilder<> Builder(&I);
5530 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
5531 return true;
5532}
5533
5534// Attempt to shrink loads that are only used by shufflevector instructions.
5535bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
5536 auto *OldLoad = dyn_cast<LoadInst>(&I);
5537 if (!OldLoad || !OldLoad->isSimple())
5538 return false;
5539
5540 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
5541 if (!OldLoadTy)
5542 return false;
5543
5544 unsigned const OldNumElements = OldLoadTy->getNumElements();
5545
5546 // Search all uses of load. If all uses are shufflevector instructions, and
5547 // the second operands are all poison values, find the minimum and maximum
5548 // indices of the vector elements referenced by all shuffle masks.
5549 // Otherwise return `std::nullopt`.
5550 using IndexRange = std::pair<int, int>;
5551 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
5552 IndexRange OutputRange = IndexRange(OldNumElements, -1);
5553 for (llvm::Use &Use : I.uses()) {
5554 // Ensure all uses match the required pattern.
5555 User *Shuffle = Use.getUser();
5556 ArrayRef<int> Mask;
5557
5558 if (!match(Shuffle,
5559 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
5560 return std::nullopt;
5561
5562 // Ignore shufflevector instructions that have no uses.
5563 if (Shuffle->use_empty())
5564 continue;
5565
5566 // Find the min and max indices used by the shufflevector instruction.
5567 for (int Index : Mask) {
5568 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
5569 OutputRange.first = std::min(Index, OutputRange.first);
5570 OutputRange.second = std::max(Index, OutputRange.second);
5571 }
5572 }
5573 }
5574
5575 if (OutputRange.second < OutputRange.first)
5576 return std::nullopt;
5577
5578 return OutputRange;
5579 };
5580
5581 // Get the range of vector elements used by shufflevector instructions.
5582 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
5583 unsigned const NewNumElements = Indices->second + 1u;
5584
5585 // If the range of vector elements is smaller than the full load, attempt
5586 // to create a smaller load.
5587 if (NewNumElements < OldNumElements) {
5588 IRBuilder Builder(&I);
5589 Builder.SetCurrentDebugLocation(I.getDebugLoc());
5590
5591 // Calculate costs of old and new ops.
5592 Type *ElemTy = OldLoadTy->getElementType();
5593 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
5594 Value *PtrOp = OldLoad->getPointerOperand();
5595
5597 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
5598 OldLoad->getPointerAddressSpace(), CostKind);
5599 InstructionCost NewCost =
5600 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
5601 OldLoad->getPointerAddressSpace(), CostKind);
5602
5603 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
5605 unsigned const MaxIndex = NewNumElements * 2u;
5606
5607 for (llvm::Use &Use : I.uses()) {
5608 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
5609
5610 // Ignore shufflevector instructions that have no uses.
5611 if (Shuffle->use_empty())
5612 continue;
5613
5614 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
5615
5616 // Create entry for new use.
5617 NewUses.push_back({Shuffle, OldMask});
5618
5619 // Validate mask indices.
5620 for (int Index : OldMask) {
5621 if (Index >= static_cast<int>(MaxIndex))
5622 return false;
5623 }
5624
5625 // Update costs.
5626 OldCost +=
5628 OldLoadTy, OldMask, CostKind);
5629 NewCost +=
5631 NewLoadTy, OldMask, CostKind);
5632 }
5633
5634 LLVM_DEBUG(
5635 dbgs() << "Found a load used only by shufflevector instructions: "
5636 << I << "\n OldCost: " << OldCost
5637 << " vs NewCost: " << NewCost << "\n");
5638
5639 if (OldCost < NewCost || !NewCost.isValid())
5640 return false;
5641
5642 // Create new load of smaller vector.
5643 auto *NewLoad = cast<LoadInst>(
5644 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
5645 NewLoad->copyMetadata(I);
5646
5647 // Replace all uses.
5648 for (UseEntry &Use : NewUses) {
5649 ShuffleVectorInst *Shuffle = Use.first;
5650 std::vector<int> &NewMask = Use.second;
5651
5652 Builder.SetInsertPoint(Shuffle);
5653 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
5654 Value *NewShuffle = Builder.CreateShuffleVector(
5655 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
5656
5657 replaceValue(*Shuffle, *NewShuffle, false);
5658 }
5659
5660 return true;
5661 }
5662 }
5663 return false;
5664}
5665
5666// Attempt to narrow a phi of shufflevector instructions where the two incoming
5667// values have the same operands but different masks. If the two shuffle masks
5668// are offsets of one another we can use one branch to rotate the incoming
5669// vector and perform one larger shuffle after the phi.
5670bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
5671 auto *Phi = dyn_cast<PHINode>(&I);
5672 if (!Phi || Phi->getNumIncomingValues() != 2u)
5673 return false;
5674
5675 Value *Op = nullptr;
5676 ArrayRef<int> Mask0;
5677 ArrayRef<int> Mask1;
5678
5679 if (!match(Phi->getOperand(0u),
5680 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
5681 !match(Phi->getOperand(1u),
5682 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
5683 return false;
5684
5685 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
5686
5687 // Ensure result vectors are wider than the argument vector.
5688 auto *InputVT = cast<FixedVectorType>(Op->getType());
5689 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
5690 auto const InputNumElements = InputVT->getNumElements();
5691
5692 if (InputNumElements >= ResultVT->getNumElements())
5693 return false;
5694
5695 // Take the difference of the two shuffle masks at each index. Ignore poison
5696 // values at the same index in both masks.
5697 SmallVector<int, 16> NewMask;
5698 NewMask.reserve(Mask0.size());
5699
5700 for (auto [M0, M1] : zip(Mask0, Mask1)) {
5701 if (M0 >= 0 && M1 >= 0)
5702 NewMask.push_back(M0 - M1);
5703 else if (M0 == -1 && M1 == -1)
5704 continue;
5705 else
5706 return false;
5707 }
5708
5709 // Ensure all elements of the new mask are equal. If the difference between
5710 // the incoming mask elements is the same, the two must be constant offsets
5711 // of one another.
5712 if (NewMask.empty() || !all_equal(NewMask))
5713 return false;
5714
5715 // Create new mask using difference of the two incoming masks.
5716 int MaskOffset = NewMask[0u];
5717 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
5718 NewMask.clear();
5719
5720 for (unsigned I = 0u; I < InputNumElements; ++I) {
5721 NewMask.push_back(Index);
5722 Index = (Index + 1u) % InputNumElements;
5723 }
5724
5725 // Calculate costs for worst cases and compare.
5726 auto const Kind = TTI::SK_PermuteSingleSrc;
5727 auto OldCost =
5728 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
5729 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
5730 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
5731 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
5732
5733 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
5734 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
5735 << "\n");
5736
5737 if (NewCost > OldCost)
5738 return false;
5739
5740 // Create new shuffles and narrowed phi.
5741 auto Builder = IRBuilder(Shuf);
5742 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
5743 auto *PoisonVal = PoisonValue::get(InputVT);
5744 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
5745 Worklist.push(cast<Instruction>(NewShuf0));
5746
5747 Builder.SetInsertPoint(Phi);
5748 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
5749 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
5750 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
5751 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
5752
5753 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
5754 PoisonVal = PoisonValue::get(NewPhi->getType());
5755 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
5756
5757 replaceValue(*Phi, *NewShuf1);
5758 return true;
5759}
5760
5761/// This is the entry point for all transforms. Pass manager differences are
5762/// handled in the callers of this function.
5763bool VectorCombine::run() {
5765 return false;
5766
5767 // Don't attempt vectorization if the target does not support vectors.
5768 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
5769 return false;
5770
5771 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
5772
5773 auto FoldInst = [this](Instruction &I) {
5774 Builder.SetInsertPoint(&I);
5775 bool IsVectorType = isa<VectorType>(I.getType());
5776 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
5777 auto Opcode = I.getOpcode();
5778
5779 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
5780
5781 // These folds should be beneficial regardless of when this pass is run
5782 // in the optimization pipeline.
5783 // The type checking is for run-time efficiency. We can avoid wasting time
5784 // dispatching to folding functions if there's no chance of matching.
5785 if (IsFixedVectorType) {
5786 switch (Opcode) {
5787 case Instruction::InsertElement:
5788 if (vectorizeLoadInsert(I))
5789 return true;
5790 break;
5791 case Instruction::ShuffleVector:
5792 if (widenSubvectorLoad(I))
5793 return true;
5794 break;
5795 default:
5796 break;
5797 }
5798 }
5799
5800 // This transform works with scalable and fixed vectors
5801 // TODO: Identify and allow other scalable transforms
5802 if (IsVectorType) {
5803 if (scalarizeOpOrCmp(I))
5804 return true;
5805 if (scalarizeLoad(I))
5806 return true;
5807 if (scalarizeExtExtract(I))
5808 return true;
5809 if (scalarizeVPIntrinsic(I))
5810 return true;
5811 if (foldInterleaveIntrinsics(I))
5812 return true;
5813 }
5814
5815 if (Opcode == Instruction::Store)
5816 if (foldSingleElementStore(I))
5817 return true;
5818
5819 // If this is an early pipeline invocation of this pass, we are done.
5820 if (TryEarlyFoldsOnly)
5821 return false;
5822
5823 // Otherwise, try folds that improve codegen but may interfere with
5824 // early IR canonicalizations.
5825 // The type checking is for run-time efficiency. We can avoid wasting time
5826 // dispatching to folding functions if there's no chance of matching.
5827 if (IsFixedVectorType) {
5828 switch (Opcode) {
5829 case Instruction::InsertElement:
5830 if (foldInsExtFNeg(I))
5831 return true;
5832 if (foldInsExtBinop(I))
5833 return true;
5834 if (foldInsExtVectorToShuffle(I))
5835 return true;
5836 break;
5837 case Instruction::ShuffleVector:
5838 if (foldPermuteOfBinops(I))
5839 return true;
5840 if (foldShuffleOfBinops(I))
5841 return true;
5842 if (foldShuffleOfSelects(I))
5843 return true;
5844 if (foldShuffleOfCastops(I))
5845 return true;
5846 if (foldShuffleOfShuffles(I))
5847 return true;
5848 if (foldPermuteOfIntrinsic(I))
5849 return true;
5850 if (foldShufflesOfLengthChangingShuffles(I))
5851 return true;
5852 if (foldShuffleOfIntrinsics(I))
5853 return true;
5854 if (foldSelectShuffle(I))
5855 return true;
5856 if (foldShuffleToIdentity(I))
5857 return true;
5858 break;
5859 case Instruction::Load:
5860 if (shrinkLoadForShuffles(I))
5861 return true;
5862 break;
5863 case Instruction::BitCast:
5864 if (foldBitcastShuffle(I))
5865 return true;
5866 if (foldSelectsFromBitcast(I))
5867 return true;
5868 break;
5869 case Instruction::And:
5870 case Instruction::Or:
5871 case Instruction::Xor:
5872 if (foldBitOpOfCastops(I))
5873 return true;
5874 if (foldBitOpOfCastConstant(I))
5875 return true;
5876 break;
5877 case Instruction::PHI:
5878 if (shrinkPhiOfShuffles(I))
5879 return true;
5880 break;
5881 default:
5882 if (shrinkType(I))
5883 return true;
5884 break;
5885 }
5886 } else {
5887 switch (Opcode) {
5888 case Instruction::Call:
5889 if (foldShuffleFromReductions(I))
5890 return true;
5891 if (foldCastFromReductions(I))
5892 return true;
5893 break;
5894 case Instruction::ExtractElement:
5895 if (foldShuffleChainsToReduce(I))
5896 return true;
5897 break;
5898 case Instruction::ICmp:
5899 if (foldSignBitReductionCmp(I))
5900 return true;
5901 if (foldICmpEqZeroVectorReduce(I))
5902 return true;
5903 if (foldEquivalentReductionCmp(I))
5904 return true;
5905 [[fallthrough]];
5906 case Instruction::FCmp:
5907 if (foldExtractExtract(I))
5908 return true;
5909 break;
5910 case Instruction::Or:
5911 if (foldConcatOfBoolMasks(I))
5912 return true;
5913 [[fallthrough]];
5914 default:
5915 if (Instruction::isBinaryOp(Opcode)) {
5916 if (foldExtractExtract(I))
5917 return true;
5918 if (foldExtractedCmps(I))
5919 return true;
5920 if (foldBinopOfReductions(I))
5921 return true;
5922 }
5923 break;
5924 }
5925 }
5926 return false;
5927 };
5928
5929 bool MadeChange = false;
5930 for (BasicBlock &BB : F) {
5931 // Ignore unreachable basic blocks.
5932 if (!DT.isReachableFromEntry(&BB))
5933 continue;
5934 // Use early increment range so that we can erase instructions in loop.
5935 // make_early_inc_range is not applicable here, as the next iterator may
5936 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
5937 // We manually maintain the next instruction and update it when it is about
5938 // to be deleted.
5939 Instruction *I = &BB.front();
5940 while (I) {
5941 NextInst = I->getNextNode();
5942 if (!I->isDebugOrPseudoInst())
5943 MadeChange |= FoldInst(*I);
5944 I = NextInst;
5945 }
5946 }
5947
5948 NextInst = nullptr;
5949
5950 while (!Worklist.isEmpty()) {
5951 Instruction *I = Worklist.removeOne();
5952 if (!I)
5953 continue;
5954
5957 continue;
5958 }
5959
5960 MadeChange |= FoldInst(*I);
5961 }
5962
5963 return MadeChange;
5964}
5965
5968 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
5970 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
5971 AAResults &AA = FAM.getResult<AAManager>(F);
5972 const DataLayout *DL = &F.getDataLayout();
5973 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
5974 TryEarlyFoldsOnly);
5975 if (!Combiner.run())
5976 return PreservedAnalyses::all();
5979 return PA;
5980}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< unsigned > MaxInstrsToScan("aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden, cl::desc("Max number of instructions to scan for aggressive instcombine."))
This is the interface for LLVM's primary stateless and local alias analysis.
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
This file defines the DenseMap class.
#define Check(C,...)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition LICM.cpp:1448
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T1
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
FunctionAnalysisManager FAM
const SmallVectorImpl< MachineOperand > & Cond
unsigned OpIndex
This file contains some templates that are useful if you are working with the STL at all.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static bool isFreeConcat(ArrayRef< InstLane > Item, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI)
Detect concat of multiple values into a vector.
static void analyzeCostOfVecReduction(const IntrinsicInst &II, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI, InstructionCost &CostBeforeReduction, InstructionCost &CostAfterReduction)
static SmallVector< InstLane > generateInstLaneVectorFromOperand(ArrayRef< InstLane > Item, int Op)
static Value * createShiftShuffle(Value *Vec, unsigned OldIndex, unsigned NewIndex, IRBuilderBase &Builder)
Create a shuffle that translates (shifts) 1 element from the input vector to a new element location.
static Value * generateNewInstTree(ArrayRef< InstLane > Item, Use *From, FixedVectorType *Ty, const DenseSet< std::pair< Value *, Use * > > &IdentityLeafs, const DenseSet< std::pair< Value *, Use * > > &SplatLeafs, const DenseSet< std::pair< Value *, Use * > > &ConcatLeafs, IRBuilderBase &Builder, const TargetTransformInfo *TTI)
std::pair< Value *, int > InstLane
static Align computeAlignmentAfterScalarization(Align VectorAlignment, Type *ScalarType, Value *Idx, const DataLayout &DL)
The memory operation on a vector of ScalarType had alignment of VectorAlignment.
static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI)
Returns true if this ShuffleVectorInst eventually feeds into a vector reduction intrinsic (e....
static cl::opt< bool > DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden, cl::desc("Disable all vector combine transforms"))
static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI)
static const unsigned InvalidIndex
static Value * translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, IRBuilderBase &Builder)
Given an extract element instruction with constant index operand, shuffle the source vector (shift th...
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, const SimplifyQuery &SQ)
Check if it is legal to scalarize a memory access to VecTy at index Idx.
static cl::opt< unsigned > MaxInstrsToScan("vector-combine-max-scan-instrs", cl::init(30), cl::Hidden, cl::desc("Max number of instructions to scan for vector combining."))
static cl::opt< bool > DisableBinopExtractShuffle("disable-binop-extract-shuffle", cl::init(false), cl::Hidden, cl::desc("Disable binop extract to shuffle transforms"))
static InstLane lookThroughShuffles(Value *V, int Lane)
static bool isMemModifiedBetween(BasicBlock::iterator Begin, BasicBlock::iterator End, const MemoryLocation &Loc, AAResults &AA)
static constexpr int Concat[]
Value * RHS
Value * LHS
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1054
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1638
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Return true if the attribute exists in this set.
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
BinaryOps getOpcode() const
Definition InstrTypes.h:374
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:986
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
bool isFPPredicate() const
Definition InstrTypes.h:782
static LLVM_ABI std::optional< CmpPredicate > getMatching(CmpPredicate A, CmpPredicate B)
Compares two CmpPredicates taking samesign into account and returns the canonicalized CmpPredicate if...
Combiner implementation.
Definition Combiner.h:33
static LLVM_ABI Constant * getExtractElement(Constant *Vec, Constant *Idx, Type *OnlyIfReducedTy=nullptr)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
LLVM_ABI ConstantRange urem(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an unsigned remainder operation of...
LLVM_ABI ConstantRange binaryAnd(const ConstantRange &Other) const
Return a new range representing the possible values resulting from a binary-and of a value in this ra...
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
This instruction extracts a single (scalar) element from a VectorType value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
Predicate getSignedPredicate() const
For example, EQ->EQ, SLE->SLE, UGT->SGT, etc.
bool isEquality() const
Return true if this predicate is either EQ or NE.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2602
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2590
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1912
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2668
void SetCurrentDebugLocation(const DebugLoc &L)
Set location information used by debugging information.
Definition IRBuilder.h:247
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1547
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition IRBuilder.h:2251
Value * CreateIsNotNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg > -1.
Definition IRBuilder.h:2692
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:1993
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2276
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:534
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:529
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2483
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2514
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition IRBuilder.h:172
Value * CreateIsNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg < 0.
Definition IRBuilder.h:2687
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2217
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1895
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1526
LLVM_ABI Value * CreateNAryOp(unsigned Opc, ArrayRef< Value * > Ops, const Twine &Name="", MDNode *FPMathTag=nullptr)
Create either a UnaryOperator or BinaryOperator depending on Opc.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2095
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2624
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1585
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1908
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2081
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:629
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1742
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateFNegFMF(Value *V, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1833
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2459
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1607
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void push(Instruction *I)
Push the instruction onto the worklist stack.
LLVM_ABI void setHasNoUnsignedWrap(bool b=true)
Set or clear the nuw flag on this instruction, which must be an operator which supports this flag.
LLVM_ABI void copyIRFlags(const Value *V, bool IncludeWrapFlags=true)
Convenience method to copy supported exact, fast-math, and (optionally) wrapping flags from V to this...
LLVM_ABI void setHasNoSignedWrap(bool b=true)
Set or clear the nsw flag on this instruction, which must be an operator which supports this flag.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void andIRFlags(const Value *V)
Logical 'and' of any supported wrapping, exact, and fast-math flags of V and this instruction.
bool isBinaryOp() const
LLVM_ABI void setNonNeg(bool b=true)
Set or clear the nneg flag on this instruction, which must be a zext instruction.
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
LLVM_ABI AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAlignment(Align Align)
Type * getPointerOperandType() const
Align getAlign() const
Return the alignment of the access that is being performed.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
const SDValue & getOperand(unsigned Num) const
This instruction constructs a fixed permutation of two input vectors.
int getMaskValue(unsigned Elt) const
Return the shuffle mask value of this instruction for the given element index.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static void commuteShuffleMask(MutableArrayRef< int > Mask, unsigned InVecNumElts)
Change values in a shuffle permute mask assuming the two vector operands of length InVecNumElts have ...
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
void setAlignment(Align Align)
Analysis pass providing the TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ None
The insert/extract is not used with a load/store.
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
static LLVM_ABI OperandValueInfo commonOperandInfo(const Value *X, const Value *Y)
Collect common data between two OperandValueInfo inputs.
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const
Returns true if GEP should not be used to index into vectors for this target.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, const Value *Op0=nullptr, const Value *Op1=nullptr, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
LLVM_ABI unsigned getMinVectorRegisterBitWidth() const
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
Estimate the overhead of scalarizing an instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:267
Value * getOperand(unsigned i) const
Definition User.h:207
static LLVM_ABI bool isVPBinOp(Intrinsic::ID ID)
std::optional< unsigned > getFunctionalIntrinsicID() const
std::optional< unsigned > getFunctionalOpcode() const
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition Value.h:737
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:162
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:964
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:146
bool use_empty() const
Definition Value.h:346
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
bool user_empty() const
Definition Value.h:389
PreservedAnalyses run(Function &F, FunctionAnalysisManager &)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type size() const
Definition DenseSet.h:87
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition APInt.h:2277
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition APInt.h:2282
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_and< Ty... > m_CombineAnd(const Ty &...Ps)
Combine pattern matchers matching all of Ps patterns.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::URem > m_URem(const LHS &L, const RHS &R)
auto m_Poison()
Match an arbitrary poison constant.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
match_bind< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
DisjointOr_match< LHS, RHS > m_DisjointOr(const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_right_shift_op > m_Shr(const LHS &L, const RHS &R)
Matches logical shift operations.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_Constant()
Match an arbitrary Constant and ignore it.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
cst_pred_ty< is_non_zero_int > m_NonZeroInt()
Match a non-zero integer or a vector with all non-zero elements.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Shl, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWShl(const LHS &L, const RHS &R)
auto m_AnyIntrinsic()
Matches any intrinsic call and ignore it.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Mul, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWMul(const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_bitwiselogic_op, true > m_c_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
match_combine_or< CastInst_match< OpTy, SExtInst >, NNegZExt_match< OpTy > > m_SExtLike(const OpTy &Op)
Match either "sext" or "zext nneg".
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
@ Valid
The data is already valid.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
@ User
could "use" a pointer
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2116
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:350
LLVM_ABI Value * simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q)
Given operand for a UnaryOperator, fold the result or return null.
scope_exit(Callable) -> scope_exit< Callable >
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
LLVM_ABI Value * simplifyCall(CallBase *Call, Value *Callee, ArrayRef< Value * > Args, const SimplifyQuery &Q)
Given a callsite, callee, and arguments, fold the result or return null.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
LLVM_ABI bool mustSuppressSpeculation(const LoadInst &LI)
Return true if speculation of the given load must be suppressed to avoid ordering or interfering with...
Definition Loads.cpp:431
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
unsigned M1(unsigned Val)
Definition VE.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:403
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool isModSet(const ModRefInfo MRI)
Definition ModRef.h:49
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI bool programUndefinedIfPoison(const Instruction *Inst)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:446
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ABI bool isKnownNonZero(const Value *V, const SimplifyQuery &Q, unsigned Depth=0)
Return true if the given value is known to be non-zero when defined.
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
LLVM_ABI bool isSafeToSpeculativelyExecuteWithOpcode(unsigned Opcode, const Instruction *Inst, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
This returns the same result as isSafeToSpeculativelyExecute if Opcode is the actual opcode of Inst.
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
LLVM_ABI Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc)
Returns the reduction intrinsic id corresponding to the binary operation.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
LLVM_ABI Constant * getLosslessInvCast(Constant *C, Type *InvCastTo, unsigned CastOp, const DataLayout &DL, PreservedCastFlags *Flags=nullptr)
Try to cast C to InvC losslessly, satisfying CastOp(InvC) equals C, or CastOp(InvC) is a refined valu...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2166
LLVM_ABI Value * simplifyCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a CmpInst, fold the result or return null.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicID(Intrinsic::ID IID)
Returns the llvm.vector.reduce min/max intrinsic that corresponds to the intrinsic op.
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, const SimplifyQuery &SQ, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
LLVM_ABI AAMDNodes adjustForAccess(unsigned AccessSize)
Create a new AAMDNode for accessing AccessSize bytes of this AAMDNode.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:310
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:262
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:146
const Instruction * CxtI
const DominatorTree * DT
SimplifyQuery getWithInstruction(const Instruction *I) const
AssumptionCache * AC