LLVM 23.0.0git
VectorCombine.cpp
Go to the documentation of this file.
1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
40#include <numeric>
41#include <optional>
42#include <queue>
43#include <set>
44
45#define DEBUG_TYPE "vector-combine"
47
48using namespace llvm;
49using namespace llvm::PatternMatch;
50
51STATISTIC(NumVecLoad, "Number of vector loads formed");
52STATISTIC(NumVecCmp, "Number of vector compares formed");
53STATISTIC(NumVecBO, "Number of vector binops formed");
54STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
55STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
56STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
57STATISTIC(NumScalarCmp, "Number of scalar compares formed");
58STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
59
61 "disable-vector-combine", cl::init(false), cl::Hidden,
62 cl::desc("Disable all vector combine transforms"));
63
65 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
66 cl::desc("Disable binop extract to shuffle transforms"));
67
69 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
70 cl::desc("Max number of instructions to scan for vector combining."));
71
72static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
73
74namespace {
75class VectorCombine {
76public:
77 VectorCombine(Function &F, const TargetTransformInfo &TTI,
80 bool TryEarlyFoldsOnly)
81 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
82 DT(DT), AA(AA), DL(DL), CostKind(CostKind),
83 SQ(*DL, /*TLI=*/nullptr, &DT, &AC),
84 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
85
86 bool run();
87
88private:
89 Function &F;
91 const TargetTransformInfo &TTI;
92 const DominatorTree &DT;
93 AAResults &AA;
94 const DataLayout *DL;
95 TTI::TargetCostKind CostKind;
96 const SimplifyQuery SQ;
97
98 /// If true, only perform beneficial early IR transforms. Do not introduce new
99 /// vector operations.
100 bool TryEarlyFoldsOnly;
101
102 InstructionWorklist Worklist;
103
104 /// Next instruction to iterate. It will be updated when it is erased by
105 /// RecursivelyDeleteTriviallyDeadInstructions.
106 Instruction *NextInst;
107
108 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
109 // parameter. That should be updated to specific sub-classes because the
110 // run loop was changed to dispatch on opcode.
111 bool vectorizeLoadInsert(Instruction &I);
112 bool widenSubvectorLoad(Instruction &I);
113 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
114 ExtractElementInst *Ext1,
115 unsigned PreferredExtractIndex) const;
116 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
117 const Instruction &I,
118 ExtractElementInst *&ConvertToShuffle,
119 unsigned PreferredExtractIndex);
120 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
121 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
122 bool foldExtractExtract(Instruction &I);
123 bool foldInsExtFNeg(Instruction &I);
124 bool foldInsExtBinop(Instruction &I);
125 bool foldInsExtVectorToShuffle(Instruction &I);
126 bool foldBitOpOfCastops(Instruction &I);
127 bool foldBitOpOfCastConstant(Instruction &I);
128 bool foldBitcastShuffle(Instruction &I);
129 bool scalarizeOpOrCmp(Instruction &I);
130 bool scalarizeVPIntrinsic(Instruction &I);
131 bool foldExtractedCmps(Instruction &I);
132 bool foldSelectsFromBitcast(Instruction &I);
133 bool foldBinopOfReductions(Instruction &I);
134 bool foldSingleElementStore(Instruction &I);
135 bool scalarizeLoad(Instruction &I);
136 bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
137 bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
138 bool scalarizeExtExtract(Instruction &I);
139 bool foldConcatOfBoolMasks(Instruction &I);
140 bool foldPermuteOfBinops(Instruction &I);
141 bool foldShuffleOfBinops(Instruction &I);
142 bool foldShuffleOfSelects(Instruction &I);
143 bool foldShuffleOfCastops(Instruction &I);
144 bool foldShuffleOfShuffles(Instruction &I);
145 bool foldPermuteOfIntrinsic(Instruction &I);
146 bool foldShufflesOfLengthChangingShuffles(Instruction &I);
147 bool foldShuffleOfIntrinsics(Instruction &I);
148 bool foldShuffleToIdentity(Instruction &I);
149 bool foldShuffleFromReductions(Instruction &I);
150 bool foldShuffleChainsToReduce(Instruction &I);
151 bool foldCastFromReductions(Instruction &I);
152 bool foldSignBitReductionCmp(Instruction &I);
153 bool foldReductionZeroTest(Instruction &I);
154 bool foldICmpEqZeroVectorReduce(Instruction &I);
155 bool foldEquivalentReductionCmp(Instruction &I);
156 bool foldReduceAddCmpZero(Instruction &I);
157 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
158 bool foldInterleaveIntrinsics(Instruction &I);
159 bool foldDeinterleaveIntrinsics(Instruction &I);
160 bool foldBitcastOfVPLoad(Instruction &I);
161 bool foldBitOrderReverseAndSwap(Instruction &I);
162 bool shrinkType(Instruction &I);
163 bool shrinkLoadForShuffles(Instruction &I);
164 bool shrinkPhiOfShuffles(Instruction &I);
165
166 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
167 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
168 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
169 Old.replaceAllUsesWith(&New);
170 if (auto *NewI = dyn_cast<Instruction>(&New)) {
171 New.takeName(&Old);
172 Worklist.pushUsersToWorkList(*NewI);
173 Worklist.pushValue(NewI);
174 }
175 if (Erase && isInstructionTriviallyDead(&Old)) {
176 eraseInstruction(Old);
177 } else {
178 Worklist.push(&Old);
179 }
180 }
181
182 void eraseInstruction(Instruction &I) {
183 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
184 SmallVector<Value *> Ops(I.operands());
185 Worklist.remove(&I);
186 I.eraseFromParent();
187
188 // Push remaining users of the operands and then the operand itself - allows
189 // further folds that were hindered by OneUse limits.
190 SmallPtrSet<Value *, 4> Visited;
191 for (Value *Op : Ops) {
192 if (!Visited.contains(Op)) {
193 if (auto *OpI = dyn_cast<Instruction>(Op)) {
195 OpI, nullptr, nullptr, [&](Value *V) {
196 if (auto *I = dyn_cast<Instruction>(V)) {
197 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
198 Worklist.remove(I);
199 if (I == NextInst)
200 NextInst = NextInst->getNextNode();
201 Visited.insert(I);
202 }
203 }))
204 continue;
205 Worklist.pushUsersToWorkList(*OpI);
206 Worklist.pushValue(OpI);
207 }
208 }
209 }
210 }
211};
212} // namespace
213
214/// Return the source operand of a potentially bitcasted value. If there is no
215/// bitcast, return the input value itself.
217 while (auto *BitCast = dyn_cast<BitCastInst>(V))
218 V = BitCast->getOperand(0);
219 return V;
220}
221
222static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
223 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
224 // The widened load may load data from dirty regions or create data races
225 // non-existent in the source.
226 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
227 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
229 return false;
230
231 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
232 // sure we have all of our type-based constraints in place for this target.
233 Type *ScalarTy = Load->getType()->getScalarType();
234 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
235 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
236 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
237 ScalarSize % 8 != 0)
238 return false;
239
240 return true;
241}
242
243bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
244 // Match insert into fixed vector of scalar value.
245 // TODO: Handle non-zero insert index.
246 Value *Scalar;
247 if (!match(&I,
249 return false;
250
251 // Optionally match an extract from another vector.
252 Value *X;
253 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
254 if (!HasExtract)
255 X = Scalar;
256
257 auto *Load = dyn_cast<LoadInst>(X);
258 if (!canWidenLoad(Load, TTI))
259 return false;
260
261 Type *ScalarTy = Scalar->getType();
262 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
263 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
264
265 // Check safety of replacing the scalar load with a larger vector load.
266 // We use minimal alignment (maximum flexibility) because we only care about
267 // the dereferenceable region. When calculating cost and creating a new op,
268 // we may use a larger value based on alignment attributes.
269 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
270 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
271
272 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
273 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
274 unsigned OffsetEltIndex = 0;
275 Align Alignment = Load->getAlign();
276 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, SQ.AC,
277 SQ.DT)) {
278 // It is not safe to load directly from the pointer, but we can still peek
279 // through gep offsets and check if it is safe to load from a base address with
280 // updated alignment. If it is, we can shuffle the element(s) into place
281 // after loading.
282 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
283 APInt Offset(OffsetBitWidth, 0);
285
286 // We want to shuffle the result down from a high element of a vector, so
287 // the offset must be non-negative.
288 if (Offset.isNegative())
289 return false;
290
291 // The offset must be a multiple of the scalar element to shuffle cleanly
292 // in the element's size.
293 uint64_t ScalarSizeInBytes = ScalarSize / 8;
294 if (Offset.urem(ScalarSizeInBytes) != 0)
295 return false;
296
297 // If we load MinVecNumElts, will our target element still be loaded?
298 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
299 if (OffsetEltIndex >= MinVecNumElts)
300 return false;
301
302 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load,
303 SQ.AC, SQ.DT))
304 return false;
305
306 // Update alignment with offset value. Note that the offset could be negated
307 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
308 // negation does not change the result of the alignment calculation.
309 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
310 }
311
312 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
313 // Use the greater of the alignment on the load or its source pointer.
314 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
315 Type *LoadTy = Load->getType();
316 unsigned AS = Load->getPointerAddressSpace();
317 InstructionCost OldCost =
318 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
319 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
320 OldCost +=
321 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
322 /* Insert */ true, HasExtract, CostKind);
323
324 // New pattern: load VecPtr
325 InstructionCost NewCost =
326 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
327 // Optionally, we are shuffling the loaded vector element(s) into place.
328 // For the mask set everything but element 0 to undef to prevent poison from
329 // propagating from the extra loaded memory. This will also optionally
330 // shrink/grow the vector from the loaded size to the output size.
331 // We assume this operation has no cost in codegen if there was no offset.
332 // Note that we could use freeze to avoid poison problems, but then we might
333 // still need a shuffle to change the vector size.
334 auto *Ty = cast<FixedVectorType>(I.getType());
335 unsigned OutputNumElts = Ty->getNumElements();
336 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
337 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
338 Mask[0] = OffsetEltIndex;
339 if (OffsetEltIndex)
340 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
341 CostKind);
342
343 // We can aggressively convert to the vector form because the backend can
344 // invert this transform if it does not result in a performance win.
345 if (OldCost < NewCost || !NewCost.isValid())
346 return false;
347
348 // It is safe and potentially profitable to load a vector directly:
349 // inselt undef, load Scalar, 0 --> load VecPtr
350 IRBuilder<> Builder(Load);
351 Value *CastedPtr =
352 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
353 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
354 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
355
356 replaceValue(I, *VecLd);
357 ++NumVecLoad;
358 return true;
359}
360
361/// If we are loading a vector and then inserting it into a larger vector with
362/// undefined elements, try to load the larger vector and eliminate the insert.
363/// This removes a shuffle in IR and may allow combining of other loaded values.
364bool VectorCombine::widenSubvectorLoad(Instruction &I) {
365 // Match subvector insert of fixed vector.
366 auto *Shuf = cast<ShuffleVectorInst>(&I);
367 if (!Shuf->isIdentityWithPadding())
368 return false;
369
370 // Allow a non-canonical shuffle mask that is choosing elements from op1.
371 unsigned NumOpElts =
372 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
373 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
374 return M >= (int)(NumOpElts);
375 });
376
377 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
378 if (!canWidenLoad(Load, TTI))
379 return false;
380
381 // We use minimal alignment (maximum flexibility) because we only care about
382 // the dereferenceable region. When calculating cost and creating a new op,
383 // we may use a larger value based on alignment attributes.
384 auto *Ty = cast<FixedVectorType>(I.getType());
385 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
386 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
387 Align Alignment = Load->getAlign();
388 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, SQ.AC,
389 SQ.DT))
390 return false;
391
392 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
393 Type *LoadTy = Load->getType();
394 unsigned AS = Load->getPointerAddressSpace();
395
396 // Original pattern: insert_subvector (load PtrOp)
397 // This conservatively assumes that the cost of a subvector insert into an
398 // undef value is 0. We could add that cost if the cost model accurately
399 // reflects the real cost of that operation.
400 InstructionCost OldCost =
401 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
402
403 // New pattern: load PtrOp
404 InstructionCost NewCost =
405 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
406
407 // We can aggressively convert to the vector form because the backend can
408 // invert this transform if it does not result in a performance win.
409 if (OldCost < NewCost || !NewCost.isValid())
410 return false;
411
412 IRBuilder<> Builder(Load);
413 Value *CastedPtr =
414 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
415 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
416 replaceValue(I, *VecLd);
417 ++NumVecLoad;
418 return true;
419}
420
421/// Determine which, if any, of the inputs should be replaced by a shuffle
422/// followed by extract from a different index.
423ExtractElementInst *VectorCombine::getShuffleExtract(
424 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
425 unsigned PreferredExtractIndex = InvalidIndex) const {
426 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
427 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
428 assert(Index0C && Index1C && "Expected constant extract indexes");
429
430 unsigned Index0 = Index0C->getZExtValue();
431 unsigned Index1 = Index1C->getZExtValue();
432
433 // If the extract indexes are identical, no shuffle is needed.
434 if (Index0 == Index1)
435 return nullptr;
436
437 Type *VecTy = Ext0->getVectorOperand()->getType();
438 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
439 InstructionCost Cost0 =
440 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
441 InstructionCost Cost1 =
442 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
443
444 // If both costs are invalid no shuffle is needed
445 if (!Cost0.isValid() && !Cost1.isValid())
446 return nullptr;
447
448 // We are extracting from 2 different indexes, so one operand must be shuffled
449 // before performing a vector operation and/or extract. The more expensive
450 // extract will be replaced by a shuffle.
451 if (Cost0 > Cost1)
452 return Ext0;
453 if (Cost1 > Cost0)
454 return Ext1;
455
456 // If the costs are equal and there is a preferred extract index, shuffle the
457 // opposite operand.
458 if (PreferredExtractIndex == Index0)
459 return Ext1;
460 if (PreferredExtractIndex == Index1)
461 return Ext0;
462
463 // Otherwise, replace the extract with the higher index.
464 return Index0 > Index1 ? Ext0 : Ext1;
465}
466
467/// Compare the relative costs of 2 extracts followed by scalar operation vs.
468/// vector operation(s) followed by extract. Return true if the existing
469/// instructions are cheaper than a vector alternative. Otherwise, return false
470/// and if one of the extracts should be transformed to a shufflevector, set
471/// \p ConvertToShuffle to that extract instruction.
472bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
473 ExtractElementInst *Ext1,
474 const Instruction &I,
475 ExtractElementInst *&ConvertToShuffle,
476 unsigned PreferredExtractIndex) {
477 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
478 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
479 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
480
481 unsigned Opcode = I.getOpcode();
482 Value *Ext0Src = Ext0->getVectorOperand();
483 Value *Ext1Src = Ext1->getVectorOperand();
484 Type *ScalarTy = Ext0->getType();
485 auto *VecTy = cast<VectorType>(Ext0Src->getType());
486 InstructionCost ScalarOpCost, VectorOpCost;
487
488 // Get cost estimates for scalar and vector versions of the operation.
489 bool IsBinOp = Instruction::isBinaryOp(Opcode);
490 if (IsBinOp) {
491 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
492 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
493 } else {
494 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
495 "Expected a compare");
496 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
497 ScalarOpCost = TTI.getCmpSelInstrCost(
498 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
499 VectorOpCost = TTI.getCmpSelInstrCost(
500 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
501 }
502
503 // Get cost estimates for the extract elements. These costs will factor into
504 // both sequences.
505 unsigned Ext0Index = Ext0IndexC->getZExtValue();
506 unsigned Ext1Index = Ext1IndexC->getZExtValue();
507
508 InstructionCost Extract0Cost =
509 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
510 InstructionCost Extract1Cost =
511 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
512
513 // A more expensive extract will always be replaced by a splat shuffle.
514 // For example, if Ext0 is more expensive:
515 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
516 // extelt (opcode (splat V0, Ext0), V1), Ext1
517 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
518 // check the cost of creating a broadcast shuffle and shuffling both
519 // operands to element 0.
520 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
521 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
522 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
523
524 // Extra uses of the extracts mean that we include those costs in the
525 // vector total because those instructions will not be eliminated.
526 InstructionCost OldCost, NewCost;
527 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
528 // Handle a special case. If the 2 extracts are identical, adjust the
529 // formulas to account for that. The extra use charge allows for either the
530 // CSE'd pattern or an unoptimized form with identical values:
531 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
532 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
533 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
534 OldCost = CheapExtractCost + ScalarOpCost;
535 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
536 } else {
537 // Handle the general case. Each extract is actually a different value:
538 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
539 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
540 NewCost = VectorOpCost + CheapExtractCost +
541 !Ext0->hasOneUse() * Extract0Cost +
542 !Ext1->hasOneUse() * Extract1Cost;
543 }
544
545 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
546 if (ConvertToShuffle) {
547 if (IsBinOp && DisableBinopExtractShuffle)
548 return true;
549
550 // If we are extracting from 2 different indexes, then one operand must be
551 // shuffled before performing the vector operation. The shuffle mask is
552 // poison except for 1 lane that is being translated to the remaining
553 // extraction lane. Therefore, it is a splat shuffle. Ex:
554 // ShufMask = { poison, poison, 0, poison }
555 // TODO: The cost model has an option for a "broadcast" shuffle
556 // (splat-from-element-0), but no option for a more general splat.
557 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
558 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
560 ShuffleMask[BestInsIndex] = BestExtIndex;
562 VecTy, VecTy, ShuffleMask, CostKind, 0,
563 nullptr, {ConvertToShuffle});
564 } else {
566 VecTy, VecTy, {}, CostKind, 0, nullptr,
567 {ConvertToShuffle});
568 }
569 }
570
571 LLVM_DEBUG(dbgs() << "Found a binop of extractions: " << I << "\n OldCost: "
572 << OldCost << " vs NewCost: " << NewCost << "\n");
573
574 // Aggressively form a vector op if the cost is equal because the transform
575 // may enable further optimization.
576 // Codegen can reverse this transform (scalarize) if it was not profitable.
577 return OldCost < NewCost;
578}
579
580/// Create a shuffle that translates (shifts) 1 element from the input vector
581/// to a new element location.
582static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
583 unsigned NewIndex, IRBuilderBase &Builder) {
584 // The shuffle mask is poison except for 1 lane that is being translated
585 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
586 // ShufMask = { 2, poison, poison, poison }
587 auto *VecTy = cast<FixedVectorType>(Vec->getType());
588 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
589 ShufMask[NewIndex] = OldIndex;
590 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
591}
592
593/// Given an extract element instruction with constant index operand, shuffle
594/// the source vector (shift the scalar element) to a NewIndex for extraction.
595/// Return null if the input can be constant folded, so that we are not creating
596/// unnecessary instructions.
597static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
598 IRBuilderBase &Builder) {
599 // Shufflevectors can only be created for fixed-width vectors.
600 Value *X = ExtElt->getVectorOperand();
601 if (!isa<FixedVectorType>(X->getType()))
602 return nullptr;
603
604 // If the extract can be constant-folded, this code is unsimplified. Defer
605 // to other passes to handle that.
606 Value *C = ExtElt->getIndexOperand();
607 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
608 if (isa<Constant>(X))
609 return nullptr;
610
611 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
612 NewIndex, Builder);
613 return Shuf;
614}
615
616/// Try to reduce extract element costs by converting scalar compares to vector
617/// compares followed by extract.
618/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
619Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
620 Instruction &I) {
621 assert(isa<CmpInst>(&I) && "Expected a compare");
622
623 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
624 // --> extelt (cmp Pred V0, V1), ExtIndex
625 ++NumVecCmp;
626 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
627 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
628 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
629}
630
631/// Try to reduce extract element costs by converting scalar binops to vector
632/// binops followed by extract.
633/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
634Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
635 Instruction &I) {
636 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
637
638 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
639 // --> extelt (bo V0, V1), ExtIndex
640 ++NumVecBO;
641 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
642 V1, "foldExtExtBinop");
643
644 // All IR flags are safe to back-propagate because any potential poison
645 // created in unused vector elements is discarded by the extract.
646 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
647 VecBOInst->copyIRFlags(&I);
648
649 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
650}
651
652/// Match an instruction with extracted vector operands.
653bool VectorCombine::foldExtractExtract(Instruction &I) {
654 // It is not safe to transform things like div, urem, etc. because we may
655 // create undefined behavior when executing those on unknown vector elements.
657 return false;
658
659 Instruction *I0, *I1;
660 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
661 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
663 return false;
664
665 Value *V0, *V1;
666 uint64_t C0, C1;
667 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
669 V0->getType() != V1->getType())
670 return false;
671
672 // For fixed-width vectors, reject out-of-bounds extract indexes
673 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(V0->getType())) {
674 unsigned NumElts = FixedVecTy->getNumElements();
675 if (C0 >= NumElts || C1 >= NumElts)
676 return false;
677 }
678
679 // If the scalar value 'I' is going to be re-inserted into a vector, then try
680 // to create an extract to that same element. The extract/insert can be
681 // reduced to a "select shuffle".
682 // TODO: If we add a larger pattern match that starts from an insert, this
683 // probably becomes unnecessary.
684 auto *Ext0 = cast<ExtractElementInst>(I0);
685 auto *Ext1 = cast<ExtractElementInst>(I1);
686 uint64_t InsertIndex = InvalidIndex;
687 if (I.hasOneUse())
688 match(I.user_back(),
689 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
690
691 ExtractElementInst *ExtractToChange;
692 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
693 return false;
694
695 Value *ExtOp0 = Ext0->getVectorOperand();
696 Value *ExtOp1 = Ext1->getVectorOperand();
697
698 if (ExtractToChange) {
699 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
700 Value *NewExtOp =
701 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
702 if (!NewExtOp)
703 return false;
704 if (ExtractToChange == Ext0)
705 ExtOp0 = NewExtOp;
706 else
707 ExtOp1 = NewExtOp;
708 }
709
710 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
711 : Ext0->getIndexOperand();
712 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
713 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
714 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
715 Worklist.push(Ext0);
716 Worklist.push(Ext1);
717 replaceValue(I, *NewExt);
718 return true;
719}
720
721/// Try to replace an extract + scalar fneg + insert with a vector fneg +
722/// shuffle.
723bool VectorCombine::foldInsExtFNeg(Instruction &I) {
724 // Match an insert (op (extract)) pattern.
725 Value *DstVec;
726 uint64_t ExtIdx, InsIdx;
727 Instruction *FNeg;
728 if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
729 m_ConstantInt(InsIdx))))
730 return false;
731
732 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
733 Value *SrcVec;
734 Instruction *Extract;
735 if (!match(FNeg, m_FNeg(m_CombineAnd(
736 m_Instruction(Extract),
737 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
738 return false;
739
740 auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
741 auto *DstVecScalarTy = DstVecTy->getScalarType();
742 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
743 if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
744 return false;
745
746 // Ignore if insert/extract index is out of bounds or destination vector has
747 // one element
748 unsigned NumDstElts = DstVecTy->getNumElements();
749 unsigned NumSrcElts = SrcVecTy->getNumElements();
750 if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
751 return false;
752
753 // We are inserting the negated element into the same lane that we extracted
754 // from. This is equivalent to a select-shuffle that chooses all but the
755 // negated element from the destination vector.
756 SmallVector<int> Mask(NumDstElts);
757 std::iota(Mask.begin(), Mask.end(), 0);
758 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
759 InstructionCost OldCost =
760 TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
761 TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
762
763 // If the extract has one use, it will be eliminated, so count it in the
764 // original cost. If it has more than one use, ignore the cost because it will
765 // be the same before/after.
766 if (Extract->hasOneUse())
767 OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
768
769 InstructionCost NewCost =
770 TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
772 DstVecTy, Mask, CostKind);
773
774 bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
775 // If the lengths of the two vectors are not equal,
776 // we need to add a length-change vector. Add this cost.
777 SmallVector<int> SrcMask;
778 if (NeedLenChg) {
779 SrcMask.assign(NumDstElts, PoisonMaskElem);
780 SrcMask[ExtIdx % NumDstElts] = ExtIdx;
782 DstVecTy, SrcVecTy, SrcMask, CostKind);
783 }
784
785 LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
786 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
787 << "\n");
788 if (NewCost > OldCost)
789 return false;
790
791 Value *NewShuf, *LenChgShuf = nullptr;
792 // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
793 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
794 if (NeedLenChg) {
795 // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
796 LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
797 NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
798 Worklist.pushValue(LenChgShuf);
799 } else {
800 // shuffle DstVec, (fneg SrcVec), Mask
801 NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
802 }
803
804 Worklist.pushValue(VecFNeg);
805 replaceValue(I, *NewShuf);
806 return true;
807}
808
809/// Try to fold insert(binop(x,y),binop(a,b),idx)
810/// --> binop(insert(x,a,idx),insert(y,b,idx))
///
/// Both binops must be one-use and share the same opcode, the insert index
/// must be a constant, and the result must be a fixed-width vector. The fold is
/// performed unless its estimated cost exceeds the original's.
/// Returns true if \p I was replaced, false otherwise.
811bool VectorCombine::foldInsExtBinop(Instruction &I) {
812 BinaryOperator *VecBinOp, *SclBinOp;
813 uint64_t Index;
    // VecBinOp is the vector being inserted into, SclBinOp the inserted scalar.
814 if (!match(&I,
815 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
816 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
817 return false;
818
819 // TODO: Add support for addlike etc.
820 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
821 if (BinOpcode != SclBinOp->getOpcode())
822 return false;
823
824 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
825 if (!ResultTy)
826 return false;
827
828 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
829 // shuffle?
830
    // NOTE(review): listing lines 831 and 833 are absent here, so the OldCost
    // declaration is incomplete as shown; only its VecBinOp term is visible.
    // Verify against the real source.
832 TTI.getInstructionCost(VecBinOp, CostKind) +
    // New sequence: one vector binop on the result type plus one insertelement
    // per binop operand.
834 InstructionCost NewCost =
835 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
836 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
837 Index, VecBinOp->getOperand(0),
838 SclBinOp->getOperand(0)) +
839 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
840 Index, VecBinOp->getOperand(1),
841 SclBinOp->getOperand(1));
842
843 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
844 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
845 << "\n");
    // Equal cost still folds; only a strictly more expensive result is rejected.
846 if (NewCost > OldCost)
847 return false;
848
    // Insert each scalar operand into the matching vector operand, then apply
    // the binop once to the two new vectors.
849 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
850 SclBinOp->getOperand(0), Index);
851 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
852 SclBinOp->getOperand(1), Index);
853 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
854
855 // Intersect flags from the old binops.
    // The dyn_cast guards against the builder returning a non-instruction.
856 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
857 NewInst->copyIRFlags(VecBinOp);
858 NewInst->andIRFlags(SclBinOp);
859 }
860
    // Queue the new inserts so they are revisited by later combines.
861 Worklist.pushValue(NewIns0);
862 Worklist.pushValue(NewIns1);
863 replaceValue(I, *NewBO);
864 return true;
865}
866
867/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
868/// Supports: bitcast, trunc, sext, zext
///
/// Both operands must be casts with the same opcode and the same source type,
/// and the source and destination scalar types must be integers. The fold is
/// performed unless its estimated cost exceeds the original's.
/// Returns true if \p I was replaced, false otherwise.
869bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
870 // Check if this is a bitwise logic operation
871 auto *BinOp = dyn_cast<BinaryOperator>(&I);
872 if (!BinOp || !BinOp->isBitwiseLogicOp())
873 return false;
874
875 // Get the cast instructions
876 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
877 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
878 if (!LHSCast || !RHSCast) {
879 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
880 return false;
881 }
882
883 // Both casts must be the same type
884 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
885 if (CastOpcode != RHSCast->getOpcode())
886 return false;
887
888 // Only handle supported cast operations
889 switch (CastOpcode) {
890 case Instruction::BitCast:
891 case Instruction::Trunc:
892 case Instruction::SExt:
893 case Instruction::ZExt:
894 break;
895 default:
896 return false;
897 }
898
899 Value *LHSSrc = LHSCast->getOperand(0);
900 Value *RHSSrc = RHSCast->getOperand(0);
901
902 // Source types must match
903 if (LHSSrc->getType() != RHSSrc->getType())
904 return false;
905
906 auto *SrcTy = LHSSrc->getType();
907 auto *DstTy = I.getType();
908 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
909 // Other casts only handle vector types with integer elements.
910 if (CastOpcode != Instruction::BitCast &&
911 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
912 return false;
913
914 // Only integer scalar/vector values are legal for bitwise logic operations.
915 if (!SrcTy->getScalarType()->isIntegerTy() ||
916 !DstTy->getScalarType()->isIntegerTy())
917 return false;
918
919 // Cost Check :
920 // OldCost = bitlogic + 2*casts
921 // NewCost = bitlogic + cast
922
923 // Calculate specific costs for each cast with instruction context
    // NOTE(review): listing lines 924 and 926 are absent; the two argument lists
    // below presumably belong to the declarations of LHSCastCost and
    // RHSCastCost, which are used further down. Verify against the real source.
925 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
927 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
928
    // The original logic op is costed on the destination type.
929 InstructionCost OldCost =
930 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
931 LHSCastCost + RHSCastCost;
932
933 // For new cost, we can't provide an instruction (it doesn't exist yet)
934 InstructionCost GenericCastCost = TTI.getCastInstrCost(
935 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
936
    // The new logic op is costed on the source type, followed by a single cast.
937 InstructionCost NewCost =
938 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
939 GenericCastCost;
940
941 // Account for multi-use casts using specific costs
    // A cast that has other users is charged to the new sequence as well.
942 if (!LHSCast->hasOneUse())
943 NewCost += LHSCastCost;
944 if (!RHSCast->hasOneUse())
945 NewCost += RHSCastCost;
946
947 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
948 << " NewCost=" << NewCost << "\n");
949
    // Equal cost still folds; only a strictly more expensive result is rejected.
950 if (NewCost > OldCost)
951 return false;
952
953 // Create the operation on the source type
954 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
955 BinOp->getName() + ".inner");
956 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
957 NewBinOp->copyIRFlags(BinOp);
958
959 Worklist.pushValue(NewOp);
960
961 // Create the cast operation directly to ensure we get a new instruction
962 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
963
964 // Preserve cast instruction flags
    // Start from the LHS cast's flags and intersect with the RHS cast's, so only
    // flags present on both casts survive.
965 NewCast->copyIRFlags(LHSCast);
966 NewCast->andIRFlags(RHSCast);
967
968 // Insert the new instruction
969 Value *Result = Builder.Insert(NewCast);
970
971 replaceValue(I, *Result);
972 return true;
973}
974
975/// Match:
976/// bitop(castop(x), C) ->
977/// bitop(castop(x), castop(InvC)) ->
978/// castop(bitop(x, InvC))
979/// Supports: bitcast, zext, sext, trunc (only when a constant InvC with
///           castop(InvC) == C can be found).
///
/// The fold is performed unless its estimated cost exceeds the original's.
/// Returns true if \p I was replaced, false otherwise.
980bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
    // NOTE(review): listing lines 981 and 985 are absent. LHS is used below but
    // its declaration is not visible, and the condition guarding the first
    // `return false` (presumably the match that binds LHS and C) is missing.
    // Verify against the real source.
982 Constant *C;
983
984 // Check if this is a bitwise logic operation
986 return false;
987
988 // Get the cast instruction
989 auto *LHSCast = dyn_cast<CastInst>(LHS);
990 if (!LHSCast)
991 return false;
992
993 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
994
995 // Only handle supported cast operations
996 switch (CastOpcode) {
997 case Instruction::BitCast:
998 case Instruction::ZExt:
999 case Instruction::SExt:
1000 case Instruction::Trunc:
1001 break;
1002 default:
1003 return false;
1004 }
1005
1006 Value *LHSSrc = LHSCast->getOperand(0);
1007
1008 auto *SrcTy = LHSSrc->getType();
1009 auto *DstTy = I.getType();
1010 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
1011 // Other casts only handle vector types with integer elements.
1012 if (CastOpcode != Instruction::BitCast &&
1013 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
1014 return false;
1015
1016 // Only integer scalar/vector values are legal for bitwise logic operations.
1017 if (!SrcTy->getScalarType()->isIntegerTy() ||
1018 !DstTy->getScalarType()->isIntegerTy())
1019 return false;
1020
1021 // Find the constant InvC, such that castop(InvC) equals to C.
    // RHSFlags is passed as an out-parameter and is used below to set flags on
    // the new cast.
1022 PreservedCastFlags RHSFlags;
1023 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
1024 if (!InvC)
1025 return false;
1026
1027 // Cost Check :
1028 // OldCost = bitlogic + cast
1029 // NewCost = bitlogic + cast
1030
1031 // Calculate the specific cost of the existing cast with instruction context
1032 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1033 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1034
    // The original logic op is costed on the destination type.
1035 InstructionCost OldCost =
1036 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1037
1038 // For new cost, we can't provide an instruction (it doesn't exist yet)
1039 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1040 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1041
    // The new logic op is costed on the source type, followed by a single cast.
1042 InstructionCost NewCost =
1043 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1044 GenericCastCost;
1045
1046 // Account for a multi-use cast using its specific cost
1047 if (!LHSCast->hasOneUse())
1048 NewCost += LHSCastCost;
1049
1050 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1051 << " NewCost=" << NewCost << "\n");
1052
    // Equal cost still folds; only a strictly more expensive result is rejected.
1053 if (NewCost > OldCost)
1054 return false;
1055
1056 // Create the operation on the source type
1057 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1058 LHSSrc, InvC, I.getName() + ".inner");
1059 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1060 NewBinOp->copyIRFlags(&I);
1061
1062 Worklist.pushValue(NewOp);
1063
1064 // Create the cast operation directly to ensure we get a new instruction
1065 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1066
1067 // Preserve cast instruction flags
    // First set the flags reported for the constant side by getLosslessInvCast,
    // then intersect them with the flags on the original cast.
1068 if (RHSFlags.NNeg)
1069 NewCast->setNonNeg();
1070 if (RHSFlags.NUW)
1071 NewCast->setHasNoUnsignedWrap();
1072 if (RHSFlags.NSW)
1073 NewCast->setHasNoSignedWrap();
1074
1075 NewCast->andIRFlags(LHSCast);
1076
1077 // Insert the new instruction
1078 Value *Result = Builder.Insert(NewCast);
1079
1080 replaceValue(I, *Result);
1081 return true;
1082}
1083
1084/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1085/// destination type followed by shuffle. This can enable further transforms by
1086/// moving bitcasts or shuffles together.
///
/// Only one-use shuffles between fixed-width vector types are handled.
/// Returns true if \p I was replaced, false otherwise.
1087bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1088 Value *V0, *V1;
1089 ArrayRef<int> Mask;
1090 if (!match(&I, m_BitCast(m_OneUse(
1091 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1092 return false;
1093
1094 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1095 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1096 // mask for scalable type is a splat or not.
1097 // 2) Disallow non-vector casts.
1098 // TODO: We could allow any shuffle.
1099 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1100 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1101 if (!DestTy || !SrcTy)
1102 return false;
1103
    // The shuffle source's total width must be a whole number of destination
    // elements, since it is re-typed with the destination scalar type below.
1104 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1105 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1106 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1107 return false;
1108
    // The shuffle is treated as unary when its second operand is undef.
1109 bool IsUnary = isa<UndefValue>(V1);
1110
1111 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1112 // if it won't increase the number of bitcasts.
1113 if (!IsUnary) {
    // NOTE(review): listing lines 1114 and 1115 are absent; BCTy0 and BCTy1 are
    // used below but their declarations are not visible. Verify against the
    // real source.
1116 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1117 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1118 return false;
1119 }
1120
1121 SmallVector<int, 16> NewMask;
1122 if (DestEltSize <= SrcEltSize) {
1123 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1124 // always be expanded to the equivalent form choosing narrower elements.
1125 if (SrcEltSize % DestEltSize != 0)
1126 return false;
1127 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1128 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1129 } else {
1130 // The bitcast is from narrow elements to wide elements. The shuffle mask
1131 // must choose consecutive elements to allow casting first.
1132 if (DestEltSize % SrcEltSize != 0)
1133 return false;
1134 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1135 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1136 return false;
1137 }
1138
1139 // Bitcast the shuffle src - keep its original width but using the destination
1140 // scalar type.
1141 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1142 auto *NewShuffleTy =
1143 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
    // Type produced by the existing shuffle: source scalar type, one element
    // per mask entry.
1144 auto *OldShuffleTy =
1145 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
    // Number of shuffle inputs that need a bitcast in the new sequence.
1146 unsigned NumOps = IsUnary ? 1 : 2;
1147
1148 // The new shuffle must not cost more than the old shuffle.
    // NOTE(review): listing lines 1149-1151 are absent; the shuffle kind SK used
    // in both cost queries below is declared there. Verify against the real
    // source.
1152
1153 InstructionCost NewCost =
1154 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1155 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1156 TargetTransformInfo::CastContextHint::None,
1157 CostKind));
1158 InstructionCost OldCost =
1159 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1160 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1161 TargetTransformInfo::CastContextHint::None,
1162 CostKind);
1163
1164 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1165 << OldCost << " vs NewCost: " << NewCost << "\n");
1166
    // Equal cost still folds; an invalid new cost never does.
1167 if (NewCost > OldCost || !NewCost.isValid())
1168 return false;
1169
1170 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1171 ++NumShufOfBitcast;
    // Cast from the value underneath any existing bitcasts on each input.
1172 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1173 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1174 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1175 replaceValue(I, *Shuf);
1176 return true;
1177}
1178
1179/// VP Intrinsics whose vector operands are both splat values may be simplified
1180/// into the scalar version of the operation and the result splatted. This
1181/// can lead to scalarization down the line.
///
/// Only binary VP intrinsics with an all-true mask are handled.
/// Returns true if \p I was replaced, false otherwise.
1182bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1183 if (!isa<VPIntrinsic>(I))
1184 return false;
1185 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
    // Arguments 0 and 1 are the two vector operands; argument 2 is used as the
    // mask and argument 3 as the EVL further down.
1186 Value *Op0 = VPI.getArgOperand(0);
1187 Value *Op1 = VPI.getArgOperand(1);
1188
1189 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1190 return false;
1191
1192 // Check getSplatValue early in this function, to avoid doing unnecessary
1193 // work.
1194 Value *ScalarOp0 = getSplatValue(Op0);
1195 Value *ScalarOp1 = getSplatValue(Op1);
1196 if (!ScalarOp0 || !ScalarOp1)
1197 return false;
1198
1199 // For the binary VP intrinsics supported here, the result on disabled lanes
1200 // is a poison value. For now, only do this simplification if all lanes
1201 // are active.
1202 // TODO: Relax the condition that all lanes are active by using insertelement
1203 // on inactive lanes.
    // True only when the mask is a splat of a constant all-ones value.
1204 auto IsAllTrueMask = [](Value *MaskVal) {
1205 if (Value *SplattedVal = getSplatValue(MaskVal))
1206 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1207 return ConstValue->isAllOnesValue();
1208 return false;
1209 };
1210 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1211 return false;
1212
1213 // Check to make sure we support scalarization of the intrinsic
1214 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1215 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1216 return false;
1217
1218 // Calculate cost of splatting both operands into vectors and the vector
1219 // intrinsic
1220 VectorType *VecTy = cast<VectorType>(VPI.getType());
    // The all-zero mask is only filled in for fixed-width vectors; otherwise it
    // stays empty.
1221 SmallVector<int> Mask;
1222 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1223 Mask.resize(FVTy->getNumElements(), 0);
    // NOTE(review): listing line 1226 is absent; the second term of SplatCost is
    // truncated (only its trailing `CostKind);` is visible). Verify against the
    // real source.
1224 InstructionCost SplatCost =
1225 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
1227 CostKind);
1228
1229 // Calculate the cost of the VP Intrinsic
    // NOTE(review): listing line 1230 is absent; the declaration of Args is not
    // visible. It is filled with the type of every VP call argument.
1231 for (Value *V : VPI.args())
1232 Args.push_back(V->getType());
1233 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1234 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
    // Original sequence: two splats feeding the vector intrinsic.
1235 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1236
1237 // Determine scalar opcode
    // Prefer a plain instruction opcode; fall back to a functional intrinsic ID,
    // and give up if neither exists.
1238 std::optional<unsigned> FunctionalOpcode =
1239 VPI.getFunctionalOpcode();
1240 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1241 if (!FunctionalOpcode) {
1242 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1243 if (!ScalarIntrID)
1244 return false;
1245 }
1246
1247 // Calculate cost of scalarizing
1248 InstructionCost ScalarOpCost = 0;
1249 if (ScalarIntrID) {
    // NOTE(review): Args still holds the VP call's argument types collected
    // above rather than scalar types - confirm this is intended for the scalar
    // cost query.
1250 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1251 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1252 } else {
1253 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1254 VecTy->getScalarType(), CostKind);
1255 }
1256
1257 // The existing splats may be kept around if other instructions use them.
1258 InstructionCost CostToKeepSplats =
1259 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
    // New sequence: one scalar op, one splat of its result, plus any operand
    // splats that have other users.
1260 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1261
1262 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1263 << "\n");
1264 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1265 << ", Cost of scalarizing:" << NewCost << "\n");
1266
1267 // We want to scalarize unless the vector variant actually has lower cost.
1268 if (OldCost < NewCost || !NewCost.isValid())
1269 return false;
1270
1271 // Scalarize the intrinsic
1272 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1273 Value *EVL = VPI.getArgOperand(3);
1274
1275 // If the VP op might introduce UB or poison, we can scalarize it provided
1276 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1277 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1278 // scalarizing it.
1279 bool SafeToSpeculate;
1280 if (ScalarIntrID)
1281 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1282 .hasAttribute(Attribute::AttrKind::Speculatable);
1283 else
    // NOTE(review): listing line 1284 is absent; the assignment to
    // SafeToSpeculate for the opcode case is truncated (only its argument list
    // is visible). Verify against the real source.
1285 *FunctionalOpcode, &VPI, nullptr, SQ.AC, SQ.DT);
1286 if (!SafeToSpeculate &&
1287 !isKnownNonZero(EVL, SimplifyQuery(*DL, SQ.DT, SQ.AC, &VPI)))
1288 return false;
1289
1290 Value *ScalarVal =
1291 ScalarIntrID
1292 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1293 {ScalarOp0, ScalarOp1})
1294 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1295 ScalarOp0, ScalarOp1);
1296
    // Splat the scalar result back to the original element count.
1297 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1298 return true;
1299}
1300
1301/// Match a vector op/compare/intrinsic with at least one
1302/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1303/// by insertelement.
///
/// Every operand must be a constant vector, an insertelement of a scalar into
/// a constant vector (all at the same constant index), or - for intrinsics - an
/// argument that is already scalar.
/// Returns true if \p I was replaced, false otherwise.
1304bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1305 auto *UO = dyn_cast<UnaryOperator>(&I);
1306 auto *BO = dyn_cast<BinaryOperator>(&I);
1307 auto *CI = dyn_cast<CmpInst>(&I);
1308 auto *II = dyn_cast<IntrinsicInst>(&I);
1309 if (!UO && !BO && !CI && !II)
1310 return false;
1311
1312 // TODO: Allow intrinsics with different argument types
    // Each intrinsic argument must either have the call's result type or be one
    // of the intrinsic's scalar operands.
1313 if (II) {
1314 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1315 return false;
1316 for (auto [Idx, Arg] : enumerate(II->args()))
1317 if (Arg->getType() != II->getType() &&
1318 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1319 return false;
1320 }
1321
1322 // Do not convert the vector condition of a vector select into a scalar
1323 // condition. That may cause problems for codegen because of differences in
1324 // boolean formats and register-file transfers.
1325 // TODO: Can we account for that in the cost model?
1326 if (CI)
1327 for (User *U : I.users())
1328 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1329 return false;
1330
1331 // Match constant vectors or scalars being inserted into constant vectors:
1332 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
    // VecCs and ScalarOps are parallel to the operand list. A null ScalarOps
    // entry marks an operand that is a plain constant vector.
1333 SmallVector<Value *> VecCs, ScalarOps;
1334 std::optional<uint64_t> Index;
1335
1336 auto Ops = II ? II->args() : I.operands();
1337 for (auto [OpNum, Op] : enumerate(Ops)) {
1338 Constant *VecC;
1339 Value *V;
1340 uint64_t InsIdx = 0;
1341 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1342 m_ConstantInt(InsIdx)))) {
1343 // Bail if any inserts are out of bounds.
1344 VectorType *OpTy = cast<VectorType>(Op->getType());
1345 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1346 return false;
1347 // All inserts must have the same index.
1348 // TODO: Deal with mismatched index constants and variable indexes?
1349 if (!Index)
1350 Index = InsIdx;
1351 else if (InsIdx != *Index)
1352 return false;
1353 VecCs.push_back(VecC);
1354 ScalarOps.push_back(V);
1355 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1356 OpNum, &TTI)) {
    // A scalar intrinsic operand is passed through unchanged in both lists.
1357 VecCs.push_back(Op.get());
1358 ScalarOps.push_back(Op.get());
1359 } else if (match(Op.get(), m_Constant(VecC))) {
1360 VecCs.push_back(VecC);
1361 ScalarOps.push_back(nullptr);
1362 } else {
1363 return false;
1364 }
1365 }
1366
1367 // Bail if all operands are constant.
    // Index is only set when at least one insertelement operand was matched.
1368 if (!Index.has_value())
1369 return false;
1370
1371 VectorType *VecTy = cast<VectorType>(I.getType());
1372 Type *ScalarTy = VecTy->getScalarType();
1373 assert(VecTy->isVectorTy() &&
1374 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1375 ScalarTy->isPointerTy()) &&
1376 "Unexpected types for insert element into binop or cmp");
1377
    // Cost the operation itself in both scalar and vector form.
1378 unsigned Opcode = I.getOpcode();
1379 InstructionCost ScalarOpCost, VectorOpCost;
1380 if (CI) {
1381 CmpInst::Predicate Pred = CI->getPredicate();
1382 ScalarOpCost = TTI.getCmpSelInstrCost(
1383 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1384 VectorOpCost = TTI.getCmpSelInstrCost(
1385 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1386 } else if (UO || BO) {
1387 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1388 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1389 } else {
1390 IntrinsicCostAttributes ScalarICA(
1391 II->getIntrinsicID(), ScalarTy,
1392 SmallVector<Type *>(II->arg_size(), ScalarTy));
1393 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1394 IntrinsicCostAttributes VectorICA(
1395 II->getIntrinsicID(), VecTy,
1396 SmallVector<Type *>(II->arg_size(), VecTy));
1397 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1398 }
1399
1400 // Fold the vector constants in the original vectors into a new base vector to
1401 // get more accurate cost modelling.
1402 Value *NewVecC = nullptr;
1403 if (CI)
1404 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1405 else if (UO)
1406 NewVecC =
1407 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1408 else if (BO)
1409 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1410 else if (II)
1411 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1412
    // Without a simplified base vector there is nothing to insert into.
1413 if (!NewVecC)
1414 return false;
1415
1416 // Get cost estimate for the insert element. This cost will factor into
1417 // both sequences.
1418 InstructionCost OldCost = VectorOpCost;
1419 InstructionCost NewCost =
1420 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1421 CostKind, *Index, NewVecC);
1422
    // Each original insertelement is part of the old cost; it also stays in the
    // new cost when the inserted operand has other users.
1423 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1424 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1425 II->getIntrinsicID(), Idx, &TTI)))
1426 continue;
    // NOTE(review): listing line 1427 is absent; the declaration of InsertCost
    // is truncated (only its argument list is visible). Verify against the real
    // source.
1428 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1429 OldCost += InsertCost;
1430 NewCost += !Op->hasOneUse() * InsertCost;
1431 }
1432
1433 // We want to scalarize unless the vector variant actually has lower cost.
1434 if (OldCost < NewCost || !NewCost.isValid())
1435 return false;
1436
1437 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1438 // inselt NewVecC, (scalar_op V0, V1), Index
1439 if (CI)
1440 ++NumScalarCmp;
1441 else if (UO || BO)
1442 ++NumScalarOps;
1443 else
1444 ++NumScalarIntrinsic;
1445
1446 // For constant cases, extract the scalar element, this should constant fold.
    // NOTE(review): listing line 1449 is absent; the statement guarded by
    // `if (!Scalar)` is truncated (presumably it fills the null ScalarOps entry
    // with the element of VecC at Index). Verify against the real source.
1447 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1448 if (!Scalar)
1450 cast<Constant>(VecC), Builder.getInt64(*Index));
1451
1452 Value *Scalar;
1453 if (CI)
1454 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1455 else if (UO || BO)
1456 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1457 else
1458 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1459
1460 Scalar->setName(I.getName() + ".scalar");
1461
1462 // All IR flags are safe to back-propagate. There is no potential for extra
1463 // poison to be created by the scalar instruction.
1464 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1465 ScalarInst->copyIRFlags(&I);
1466
    // Put the scalar result back at the shared index of the folded base vector.
1467 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1468 replaceValue(I, *Insert);
1469 return true;
1470}
1471
1472/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1473/// a vector into vector operations followed by extract. Note: The SLP pass
1474/// may miss this pattern because of implementation problems.
///
/// Returns true if \p I was replaced, false otherwise.
1475bool VectorCombine::foldExtractedCmps(Instruction &I) {
1476 auto *BI = dyn_cast<BinaryOperator>(&I);
1477
1478 // We are looking for a scalar binop of booleans.
1479 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1480 if (!BI || !I.getType()->isIntegerTy(1))
1481 return false;
1482
1483 // The compare predicates should match, and each compare should have a
1484 // constant operand.
1485 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1486 Instruction *I0, *I1;
1487 Constant *C0, *C1;
1488 CmpPredicate P0, P1;
1489 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1490 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1491 return false;
1492
1493 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1494 if (!MatchingPred)
1495 return false;
1496
1497 // The compare operands must be extracts of the same vector with constant
1498 // extract indexes.
1499 Value *X;
1500 uint64_t Index0, Index1;
1501 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1502 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1503 return false;
1504
    // getShuffleExtract picks which of the two extracts is replaced by a
    // shuffle; a null result means the fold is not attempted.
1505 auto *Ext0 = cast<ExtractElementInst>(I0);
1506 auto *Ext1 = cast<ExtractElementInst>(I1);
1507 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1508 if (!ConvertToShuf)
1509 return false;
1510 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1511 "Unknown ExtractElementInst");
1512
1513 // The original scalar pattern is:
1514 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1515 CmpInst::Predicate Pred = *MatchingPred;
1516 unsigned CmpOpcode =
1517 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1518 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1519 if (!VecTy)
1520 return false;
1521
    // Both indices are used below to index per-lane arrays, so they must be in
    // range.
1522 if (Index0 >= VecTy->getNumElements() || Index1 >= VecTy->getNumElements())
1523 return false;
1524
1525 InstructionCost Ext0Cost =
1526 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1527 InstructionCost Ext1Cost =
1528 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
    // NOTE(review): listing line 1529 is absent; the declaration of CmpCost is
    // truncated (only its argument list is visible). Verify against the real
    // source.
1530 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1531 CostKind);
1532
    // Original sequence: two extracts, two scalar compares and the scalar binop.
1533 InstructionCost OldCost =
1534 Ext0Cost + Ext1Cost + CmpCost * 2 +
1535 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1536
1537 // The proposed vector pattern is:
1538 // vcmp = cmp Pred X, VecC
1539 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
    // The extract chosen for conversion supplies ExpensiveIndex; the other one
    // supplies CheapIndex, the lane the final extract reads.
1540 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1541 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
    // NOTE(review): listing lines 1542, 1543 and 1547 are absent; the
    // declarations of CmpTy and NewCost and the head of the shuffle-cost term
    // are not visible. Verify against the real source.
1544 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1545 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1546 ShufMask[CheapIndex] = ExpensiveIndex;
1548 CmpTy, ShufMask, CostKind);
1549 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1550 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
    // An extract with other users is charged to the new sequence as well.
1551 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1552 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1553
1554 // Aggressively form vector ops if the cost is equal because the transform
1555 // may enable further optimization.
1556 // Codegen can reverse this transform (scalarize) if it was not profitable.
1557 if (OldCost < NewCost || !NewCost.isValid())
1558 return false;
1559
1560 // Create a vector constant from the 2 scalar constants.
    // All other lanes are poison. If Index0 == Index1 the second store wins.
1561 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1562 PoisonValue::get(VecTy->getElementType()));
1563 CmpC[Index0] = C0;
1564 CmpC[Index1] = C1;
1565 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1566 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
    // Keep the original operand order of the binop.
1567 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1568 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1569 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1570 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1571 replaceValue(I, *NewExt);
1572 ++NumVecCmpBO;
1573 return true;
1574}
1575
1576/// Try to fold scalar selects that select between extracted elements and zero
1577/// into extracting from a vector select. This is rooted at the bitcast.
1578///
1579/// This pattern arises when a vector is bitcast to a smaller element type,
1580/// elements are extracted, and then conditionally selected with zero:
1581///
1582/// %bc = bitcast <4 x i32> %src to <16 x i8>
1583/// %e0 = extractelement <16 x i8> %bc, i32 0
1584/// %s0 = select i1 %cond, i8 %e0, i8 0
1585/// %e1 = extractelement <16 x i8> %bc, i32 1
1586/// %s1 = select i1 %cond, i8 %e1, i8 0
1587/// ...
1588///
1589/// Transforms to:
1590/// %sel = select i1 %cond, <4 x i32> %src, <4 x i32> zeroinitializer
1591/// %bc = bitcast <4 x i32> %sel to <16 x i8>
1592/// %e0 = extractelement <16 x i8> %bc, i32 0
1593/// %e1 = extractelement <16 x i8> %bc, i32 1
1594/// ...
1595///
1596/// This is profitable because vector select on wider types produces fewer
1597/// select/cndmask instructions than scalar selects on each element.
///
/// Selects are grouped by condition and each group is folded independently.
/// Returns true if at least one select was replaced.
1598bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
1599 auto *BC = dyn_cast<BitCastInst>(&I);
1600 if (!BC)
1601 return false;
1602
1603 FixedVectorType *SrcVecTy = dyn_cast<FixedVectorType>(BC->getSrcTy());
1604 FixedVectorType *DstVecTy = dyn_cast<FixedVectorType>(BC->getDestTy());
1605 if (!SrcVecTy || !DstVecTy)
1606 return false;
1607
1608 // Source must be 32-bit or 64-bit elements, destination must be smaller
1609 // integer elements. Zero in all these types is all-bits-zero.
1610 Type *SrcEltTy = SrcVecTy->getElementType();
1611 Type *DstEltTy = DstVecTy->getElementType();
1612 unsigned SrcEltBits = SrcEltTy->getPrimitiveSizeInBits();
1613 unsigned DstEltBits = DstEltTy->getPrimitiveSizeInBits();
1614
1615 if (SrcEltBits != 32 && SrcEltBits != 64)
1616 return false;
1617
1618 if (!DstEltTy->isIntegerTy() || DstEltBits >= SrcEltBits)
1619 return false;
1620
1621 // Check profitability using TTI before collecting users.
1622 Type *CondTy = CmpInst::makeCmpResultType(DstEltTy);
1623 Type *VecCondTy = CmpInst::makeCmpResultType(SrcVecTy);
1624
    // NOTE(review): listing lines 1627 and 1630 are absent; the trailing
    // arguments of both cost queries below are not visible. Verify against the
    // real source.
1625 InstructionCost ScalarSelCost =
1626 TTI.getCmpSelInstrCost(Instruction::Select, DstEltTy, CondTy,
1628 InstructionCost VecSelCost =
1629 TTI.getCmpSelInstrCost(Instruction::Select, SrcVecTy, VecCondTy,
1631
1632 // We need at least this many selects for vectorization to be profitable.
1633 // VecSelCost < ScalarSelCost * NumSelects => NumSelects > VecSelCost /
1634 // ScalarSelCost
    // A zero scalar cost is also rejected because it is used as a divisor below.
1635 if (!ScalarSelCost.isValid() || ScalarSelCost == 0)
1636 return false;
1637
    // NOTE(review): VecSelCost has no visible isValid() check before getValue()
    // is called on it - confirm an invalid vector cost cannot reach this point.
1638 unsigned MinSelects = (VecSelCost.getValue() / ScalarSelCost.getValue()) + 1;
1639
1640 // Quick check: if bitcast doesn't have enough users, bail early.
1641 if (!BC->hasNUsesOrMore(MinSelects))
1642 return false;
1643
1644 // Collect all select users that match the pattern, grouped by condition.
1645 // Pattern: select i1 %cond, (extractelement %bc, idx), 0
1646 DenseMap<Value *, SmallVector<SelectInst *, 8>> CondToSelects;
1647
1648 for (User *U : BC->users()) {
1649 auto *Ext = dyn_cast<ExtractElementInst>(U);
1650 if (!Ext)
1651 continue;
1652
1653 for (User *ExtUser : Ext->users()) {
1654 Value *Cond;
1655 // Match: select i1 %cond, %ext, 0
    // The isIntegerTy(1) test restricts the match to a scalar i1 condition.
1656 if (match(ExtUser, m_Select(m_Value(Cond), m_Specific(Ext), m_Zero())) &&
1657 Cond->getType()->isIntegerTy(1))
1658 CondToSelects[Cond].push_back(cast<SelectInst>(ExtUser));
1659 }
1660 }
1661
1662 if (CondToSelects.empty())
1663 return false;
1664
1665 bool MadeChange = false;
1666 Value *SrcVec = BC->getOperand(0);
1667
1668 // Process each group of selects with the same condition.
    // NOTE(review): the structured binding is by value, so each group's vector
    // is copied, and the map is keyed by pointer - confirm that neither the
    // copy nor the iteration order matters for the emitted IR.
1669 for (auto [Cond, Selects] : CondToSelects) {
1670 // Only profitable if vector select cost < total scalar select cost.
1671 if (Selects.size() < MinSelects) {
1672 LLVM_DEBUG(dbgs() << "VectorCombine: foldSelectsFromBitcast not "
1673 << "profitable (VecCost=" << VecSelCost
1674 << ", ScalarCost=" << ScalarSelCost
1675 << ", NumSelects=" << Selects.size() << ")\n");
1676 continue;
1677 }
1678
1679 // Create the vector select and bitcast once for this condition.
    // Default insertion point is right after the bitcast; if the condition is
    // an instruction dominated by the bitcast, insert right after the condition
    // instead.
1680 auto InsertPt = std::next(BC->getIterator());
1681
1682 if (auto *CondInst = dyn_cast<Instruction>(Cond))
1683 if (DT.dominates(BC, CondInst))
1684 InsertPt = std::next(CondInst->getIterator());
1685
1686 Builder.SetInsertPoint(InsertPt);
1687 Value *VecSel =
1688 Builder.CreateSelect(Cond, SrcVec, Constant::getNullValue(SrcVecTy));
1689 Value *NewBC = Builder.CreateBitCast(VecSel, DstVecTy);
1690
1691 // Replace each scalar select with an extract from the new bitcast.
1692 for (SelectInst *Sel : Selects) {
    // The true value is the extract matched when the select was collected, so
    // the cast cannot fail; its index is reused on the new bitcast.
1693 auto *Ext = cast<ExtractElementInst>(Sel->getTrueValue());
1694 Value *Idx = Ext->getIndexOperand();
1695
1696 Builder.SetInsertPoint(Sel);
1697 Value *NewExt = Builder.CreateExtractElement(NewBC, Idx);
1698 replaceValue(*Sel, *NewExt);
1699 MadeChange = true;
1700 }
1701
1702 LLVM_DEBUG(dbgs() << "VectorCombine: folded " << Selects.size()
1703 << " selects into vector select\n");
1704 }
1705
1706 return MadeChange;
1707}
1708
// Cost helper for a vector reduction intrinsic call \p II. In this file it is
// invoked by foldBinopOfReductions as
//   analyzeCostOfVecReduction(II, CostKind, TTI, CostBefore, CostAfter).
// CostAfterReduction is written on every path. CostBeforeReduction is written
// only when one of the two extend patterns below matches; otherwise it keeps
// whatever value the caller passed in.
// NOTE(review): the embedded listing numbers jump from 1708 to 1711, so the
// beginning of the signature (function name and leading parameters) is missing
// from this copy. Listing lines 1725, 1733, 1745 and 1750 are missing too,
// which leaves four expressions below truncated - restore from upstream.
1711 const TargetTransformInfo &TTI,
1712 InstructionCost &CostBeforeReduction,
1713 InstructionCost &CostAfterReduction) {
1714 Instruction *Op0, *Op1;
1715 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1716 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1717 unsigned ReductionOpc =
1718 getArithmeticReductionInstruction(II.getIntrinsicID());
  // Case 1: the reduced value is itself a zext/sext. Cost the extend
  // separately and the reduction as an extended reduction.
1719 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1720 bool IsUnsigned = isa<ZExtInst>(RedOp);
1721 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1722
  // NOTE(review): the getCastInstrCost call is cut off (listing line 1725).
1723 CostBeforeReduction =
1724 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1726 CostAfterReduction =
1727 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1728 ExtType, FastMathFlags(), CostKind);
1729 return;
1730 }
  // Case 2 (add reductions only): Op0 and Op1 must be extends with the same
  // opcode and the same source type; additionally either their opcode equals
  // RedOp's opcode or both are the same instruction.
  // NOTE(review): the pattern that binds Op0/Op1 is on the missing listing
  // line 1733.
1731 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1732 match(RedOp,
1734 match(Op0, m_ZExtOrSExt(m_Value())) &&
1735 Op0->getOpcode() == Op1->getOpcode() &&
1736 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1737 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1738 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1739 bool IsUnsigned = isa<ZExtInst>(Op0);
1740 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1741 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1742
  // NOTE(review): both getCastInstrCost calls below are cut off (listing
  // lines 1745 and 1750).
1743 InstructionCost ExtCost =
1744 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1746 InstructionCost MulCost =
1747 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1748 InstructionCost Ext2Cost =
1749 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1751
  // Two inner extends, the multiply and the outer extend feed the reduction.
1752 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1753 CostAfterReduction = TTI.getMulAccReductionCost(
1754 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1755 return;
1756 }
  // Fallback: a plain arithmetic reduction; CostBeforeReduction is untouched.
1757 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1758 std::nullopt, CostKind);
1759}
1760
1761bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1762 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1763 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1764 if (BinOpOpc == Instruction::Sub)
1765 ReductionIID = Intrinsic::vector_reduce_add;
1766 if (ReductionIID == Intrinsic::not_intrinsic)
1767 return false;
1768 // FP reductions have a start-value operand that this fold doesn't handle.
1769 if (ReductionIID == Intrinsic::vector_reduce_fadd ||
1770 ReductionIID == Intrinsic::vector_reduce_fmul)
1771 return false;
1772
1773 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1774 Intrinsic::ID IID) -> Value * {
1775 auto *II = dyn_cast<IntrinsicInst>(V);
1776 if (!II)
1777 return nullptr;
1778 if (II->getIntrinsicID() == IID && II->hasOneUse())
1779 return II->getArgOperand(0);
1780 return nullptr;
1781 };
1782
1783 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1784 if (!V0)
1785 return false;
1786 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1787 if (!V1)
1788 return false;
1789
1790 auto *VTy = cast<VectorType>(V0->getType());
1791 if (V1->getType() != VTy)
1792 return false;
1793 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1794 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1795 unsigned ReductionOpc =
1796 getArithmeticReductionInstruction(II0.getIntrinsicID());
1797
1798 InstructionCost OldCost = 0;
1799 InstructionCost NewCost = 0;
1800 InstructionCost CostOfRedOperand0 = 0;
1801 InstructionCost CostOfRed0 = 0;
1802 InstructionCost CostOfRedOperand1 = 0;
1803 InstructionCost CostOfRed1 = 0;
1804 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1805 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1806 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1807 NewCost =
1808 CostOfRedOperand0 + CostOfRedOperand1 +
1809 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1810 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1811 if (NewCost >= OldCost || !NewCost.isValid())
1812 return false;
1813
1814 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1815 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1816 << "\n");
1817 Value *VectorBO;
1818 if (BinOpOpc == Instruction::Or)
1819 VectorBO = Builder.CreateOr(V0, V1, "",
1820 cast<PossiblyDisjointInst>(I).isDisjoint());
1821 else
1822 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1823
1824 Value *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1825 replaceValue(I, *Rdx);
1826 return true;
1827}
1828
1829// Check if memory loc modified between two instrs in the same BB
// Returns true when some instruction in [Begin, End) may modify \p Loc
// according to \p AA. It also returns true - conservatively - once more than
// MaxInstrsToScan non-modifying instructions have been visited.
// NOTE(review): listing lines 1830-1831 (the start of the signature, with the
// function name and the Begin/End parameters) are missing from this copy. The
// name isMemModifiedBetween is taken from the call in foldSingleElementStore.
1832 const MemoryLocation &Loc, AAResults &AA) {
1833 unsigned NumScanned = 0;
1834 return std::any_of(Begin, End, [&](const Instruction &Instr) {
  // The counter is only bumped for instructions that do not modify Loc,
  // because || short-circuits on the first operand.
1835 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1836 ++NumScanned > MaxInstrsToScan;
1837 });
1838}
1839
1840namespace {
1841/// Helper class to indicate whether a vector index can be safely scalarized and
1842/// if a freeze needs to be inserted.
1843class ScalarizationResult {
1844 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1845
1846 StatusTy Status;
1847 Value *ToFreeze;
1848
1849 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1850 : Status(Status), ToFreeze(ToFreeze) {}
1851
1852public:
1853 ScalarizationResult(const ScalarizationResult &Other) = default;
1854 ~ScalarizationResult() {
1855 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1856 }
1857
1858 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1859 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1860 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1861 return {StatusTy::SafeWithFreeze, ToFreeze};
1862 }
1863
1864 /// Returns true if the index can be scalarize without requiring a freeze.
1865 bool isSafe() const { return Status == StatusTy::Safe; }
1866 /// Returns true if the index cannot be scalarized.
1867 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1868 /// Returns true if the index can be scalarize, but requires inserting a
1869 /// freeze.
1870 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1871
1872 /// Reset the state of Unsafe and clear ToFreze if set.
1873 void discard() {
1874 ToFreeze = nullptr;
1875 Status = StatusTy::Unsafe;
1876 }
1877
1878 /// Freeze the ToFreeze and update the use in \p User to use it.
1879 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1880 assert(isSafeWithFreeze() &&
1881 "should only be used when freezing is required");
1882 assert(is_contained(ToFreeze->users(), &UserI) &&
1883 "UserI must be a user of ToFreeze");
1884 IRBuilder<>::InsertPointGuard Guard(Builder);
1885 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1886 Value *Frozen =
1887 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1888 for (Use &U : make_early_inc_range((UserI.operands())))
1889 if (U.get() == ToFreeze)
1890 U.set(Frozen);
1891
1892 ToFreeze = nullptr;
1893 }
1894};
1895} // namespace
1896
1897/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1898/// Idx. \p Idx must access a valid vector element.
1899static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1900 const SimplifyQuery &SQ) {
1901 // We do checks for both fixed vector types and scalable vector types.
1902 // This is the number of elements of fixed vector types,
1903 // or the minimum number of elements of scalable vector types.
1904 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1905 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1906
1907 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1908 if (C->getValue().ult(NumElements))
1909 return ScalarizationResult::safe();
1910 return ScalarizationResult::unsafe();
1911 }
1912
1913 // Always unsafe if the index type can't handle all inbound values.
1914 if (!llvm::isUIntN(IntWidth, NumElements))
1915 return ScalarizationResult::unsafe();
1916
1917 APInt Zero(IntWidth, 0);
1918 APInt MaxElts(IntWidth, NumElements);
1919 ConstantRange ValidIndices(Zero, MaxElts);
1920 ConstantRange IdxRange(IntWidth, true);
1921
1922 if (isGuaranteedNotToBePoison(Idx, SQ.AC, SQ.CxtI, SQ.DT)) {
1923 if (ValidIndices.contains(
1924 computeConstantRange(Idx, /*ForSigned=*/false, SQ)))
1925 return ScalarizationResult::safe();
1926 return ScalarizationResult::unsafe();
1927 }
1928
1929 // If the index may be poison, check if we can insert a freeze before the
1930 // range of the index is restricted.
1931 Value *IdxBase;
1932 ConstantInt *CI;
1933 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1934 IdxRange = IdxRange.binaryAnd(CI->getValue());
1935 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1936 IdxRange = IdxRange.urem(CI->getValue());
1937 }
1938
1939 if (ValidIndices.contains(IdxRange))
1940 return ScalarizationResult::safeWithFreeze(IdxBase);
1941 return ScalarizationResult::unsafe();
1942}
1943
1944/// The memory operation on a vector of \p ScalarType had alignment of
1945/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1946/// alignment that will be valid for the memory operation on a single scalar
1947/// element of the same type with index \p Idx.
/// For a constant index the result is the common alignment of
/// \p VectorAlignment and the element's byte offset (index times the store
/// size of \p ScalarType); for any other index only the store size of one
/// element is used.
// NOTE(review): listing line 1948 (the start of the signature, carrying the
// return type, the function name and the VectorAlignment parameter) is
// missing from this copy. The name computeAlignmentAfterScalarization is
// taken from the call sites further down.
1949 Type *ScalarType, Value *Idx,
1950 const DataLayout &DL) {
1951 if (auto *C = dyn_cast<ConstantInt>(Idx))
1952 return commonAlignment(VectorAlignment,
1953 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1954 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1955}
1956
1957// Combine patterns like:
1958// %0 = load <4 x i32>, <4 x i32>* %a
1959// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1960// store <4 x i32> %1, <4 x i32>* %a
1961// to:
1962// %0 = bitcast <4 x i32>* %a to i32*
1963// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1964// store i32 %b, i32* %1
// Returns true if the vector store was replaced by a scalar store.
// NOTE(review): listing lines 1966, 1974 and 2015 are missing from this copy:
// the condition guarding the first `return false`, presumably the declaration
// of `Source` (it is used below but never declared in the visible text), and
// one statement just before the final `return true`.
1965bool VectorCombine::foldSingleElementStore(Instruction &I) {
1967 return false;
1968 auto *SI = cast<StoreInst>(&I);
1969 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1970 return false;
1971
1972 // TODO: Combine more complicated patterns (multiple insert) by referencing
1973 // TargetTransformInfo.
1975 Value *NewElement;
1976 Value *Idx;
  // The stored value must be an insertelement whose vector operand is an
  // instruction.
1977 if (!match(SI->getValueOperand(),
1978 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1979 m_Value(Idx))))
1980 return false;
1981
1982 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1983 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1984 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1985 // Don't optimize for atomic/volatile load or store. Ensure memory is not
1986 // modified between, vector type matches store size, and index is inbounds.
1987 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1988 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1989 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1990 return false;
1991
1992 auto ScalarizableIdx =
1993 canScalarizeAccess(VecTy, Idx, SQ.getWithInstruction(Load));
1994 if (ScalarizableIdx.isUnsafe() ||
1995 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1996 MemoryLocation::get(SI), AA))
1997 return false;
1998
1999 // Ensure we add the load back to the worklist BEFORE its users so they can
2000 // be erased in the correct order.
2001 Worklist.push(Load);
2002
  // Freeze the base of the index first when the index may be poison.
2003 if (ScalarizableIdx.isSafeWithFreeze())
2004 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
  // Address the element through a GEP on the vector type: {0, Idx}.
2005 Value *GEP = Builder.CreateInBoundsGEP(
2006 SI->getValueOperand()->getType(), SI->getPointerOperand(),
2007 {ConstantInt::get(Idx->getType(), 0), Idx});
2008 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
2009 NSI->copyMetadata(*SI);
  // The scalar alignment is derived from the larger of the two vector
  // alignments.
2010 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
2011 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
2012 *DL);
2013 NSI->setAlignment(ScalarOpAlignment);
2014 replaceValue(I, *NSI);
2016 return true;
2017 }
2018
2019 return false;
2020}
2021
2022/// Try to scalarize vector loads feeding extractelement or bitcast
2023/// instructions.
2024bool VectorCombine::scalarizeLoad(Instruction &I) {
2025 Value *Ptr;
2026 if (!match(&I, m_Load(m_Value(Ptr))))
2027 return false;
2028
2029 auto *LI = cast<LoadInst>(&I);
2030 auto *VecTy = cast<VectorType>(LI->getType());
2031
2032 // The isSimple() check could be isUnordered(), but for now we cowardly
2033 // refuse to handle even unordered atomics.
2034 if (!LI->isSimple() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
2035 return false;
2036
2037 bool AllExtracts = true;
2038 bool AllBitcasts = true;
2039 Instruction *LastCheckedInst = LI;
2040 unsigned NumInstChecked = 0;
2041
2042 // Check what type of users we have (must either all be extracts or
2043 // bitcasts) and ensure no memory modifications between the load and
2044 // its users.
2045 for (User *U : LI->users()) {
2046 auto *UI = dyn_cast<Instruction>(U);
2047 if (!UI || UI->getParent() != LI->getParent())
2048 return false;
2049
2050 // If any user is waiting to be erased, then bail out as this will
2051 // distort the cost calculation and possibly lead to infinite loops.
2052 if (UI->use_empty())
2053 return false;
2054
2055 if (!isa<ExtractElementInst>(UI))
2056 AllExtracts = false;
2057 if (!isa<BitCastInst>(UI))
2058 AllBitcasts = false;
2059
2060 // Check if any instruction between the load and the user may modify memory.
2061 if (LastCheckedInst->comesBefore(UI)) {
2062 for (Instruction &I :
2063 make_range(std::next(LI->getIterator()), UI->getIterator())) {
2064 // Bail out if we reached the check limit or the instruction may write
2065 // to memory.
2066 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
2067 return false;
2068 NumInstChecked++;
2069 }
2070 LastCheckedInst = UI;
2071 }
2072 }
2073
2074 if (AllExtracts)
2075 return scalarizeLoadExtract(LI, VecTy, Ptr);
2076 if (AllBitcasts)
2077 return scalarizeLoadBitcast(LI, VecTy, Ptr);
2078 return false;
2079}
2080
2081/// Try to scalarize vector loads feeding extractelement instructions.
/// Every user of \p LI is treated as an extractelement (enforced with cast<>)
/// and, when the scalar form is strictly cheaper, replaced by a scalar load
/// from a GEP into the vector at \p Ptr. Returns true if the IR was changed.
// NOTE(review): listing lines 2084, 2096 and 2117 are missing from this copy:
// the condition guarding the first `return false` and the trailing arguments
// of both getMemoryOpCost calls.
2082bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
2083 Value *Ptr) {
2085 return false;
2086
  // Extracts whose index needs a freeze, keyed by the extract instruction.
2087 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
2088 llvm::scope_exit FailureGuard([&]() {
2089 // If the transform is aborted, discard the ScalarizationResults.
2090 for (auto &Pair : NeedFreeze)
2091 Pair.second.discard();
2092 });
2093
2094 InstructionCost OriginalCost =
2095 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2097 InstructionCost ScalarizedCost = 0;
2098
  // First pass: check every index and accumulate both cost estimates.
2099 for (User *U : LI->users()) {
2100 auto *UI = cast<ExtractElementInst>(U);
2101
2102 auto ScalarIdx = canScalarizeAccess(VecTy, UI->getIndexOperand(),
2103 SQ.getWithInstruction(LI));
2104 if (ScalarIdx.isUnsafe())
2105 return false;
2106 if (ScalarIdx.isSafeWithFreeze()) {
  // Keep a copy in the map and neutralize the local so that its
  // destructor does not assert.
2107 NeedFreeze.try_emplace(UI, ScalarIdx);
2108 ScalarIdx.discard();
2109 }
2110
  // A non-constant index is passed to the cost query as -1.
2111 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
2112 OriginalCost +=
2113 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
2114 Index ? Index->getZExtValue() : -1);
2115 ScalarizedCost +=
2116 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
2118 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
2119 nullptr, nullptr, CostKind);
2120 }
2121
2122 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
2123 << "\n LoadExtractCost: " << OriginalCost
2124 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2125
2126 if (ScalarizedCost >= OriginalCost)
2127 return false;
2128
2129 // Ensure we add the load back to the worklist BEFORE its users so they can
2130 // be erased in the correct order.
2131 Worklist.push(LI);
2132
2133 Type *ElemType = VecTy->getElementType();
2134
2135 // Replace extracts with narrow scalar loads.
2136 for (User *U : LI->users()) {
2137 auto *EI = cast<ExtractElementInst>(U);
2138 Value *Idx = EI->getIndexOperand();
2139
2140 // Insert 'freeze' for poison indexes.
2141 auto It = NeedFreeze.find(EI);
2142 if (It != NeedFreeze.end())
2143 It->second.freeze(Builder, *cast<Instruction>(Idx));
2144
2145 Builder.SetInsertPoint(EI);
2146 Value *GEP =
2147 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
2148 auto *NewLoad = cast<LoadInst>(
2149 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
2150
2151 Align ScalarOpAlignment =
2152 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
2153 NewLoad->setAlignment(ScalarOpAlignment);
2154
  // AA metadata is only carried over when the byte offset is known, i.e.
  // for a constant index.
2155 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
2156 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
2157 AAMDNodes OldAAMD = LI->getAAMetadata();
2158 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
2159 }
2160
2161 replaceValue(*EI, *NewLoad, false);
2162 }
2163
  // Success: the guard's discard pass is no longer wanted.
2164 FailureGuard.release();
2165 return true;
2166}
2167
2168/// Try to scalarize vector loads feeding bitcast instructions.
/// Every user of \p LI is treated as a bitcast (enforced with cast<>). All of
/// them must target the same integer or floating-point type whose bit width
/// equals that of \p VecTy. When a scalar load is strictly cheaper, one scalar
/// load from \p Ptr replaces all the bitcasts. Returns true if the IR changed.
// NOTE(review): listing lines 2173, 2197 and 2206 are missing from this copy:
// the trailing arguments of both getMemoryOpCost calls and of the
// getCastInstrCost call.
2169bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
2170 Value *Ptr) {
2171 InstructionCost OriginalCost =
2172 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2174
2175 Type *TargetScalarType = nullptr;
2176 unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
2177
  // Validate every bitcast user and add its cost to the original sequence.
2178 for (User *U : LI->users()) {
2179 auto *BC = cast<BitCastInst>(U);
2180
2181 Type *DestTy = BC->getDestTy();
2182 if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
2183 return false;
2184
2185 unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
2186 if (DestBitWidth != VecBitWidth)
2187 return false;
2188
2189 // All bitcasts must target the same scalar type.
2190 if (!TargetScalarType)
2191 TargetScalarType = DestTy;
2192 else if (TargetScalarType != DestTy)
2193 return false;
2194
2195 OriginalCost +=
2196 TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
2198 }
2199
  // No users at all means there is nothing to rewrite.
2200 if (!TargetScalarType)
2201 return false;
2202
2203 assert(!LI->user_empty() && "Unexpected load without bitcast users");
2204 InstructionCost ScalarizedCost =
2205 TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
2207
2208 LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2209 << "\n OriginalCost: " << OriginalCost
2210 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2211
2212 if (ScalarizedCost >= OriginalCost)
2213 return false;
2214
2215 // Ensure we add the load back to the worklist BEFORE its users so they can
2216 // be erased in the correct order.
2217 Worklist.push(LI);
2218
  // The scalar load takes the vector load's position, alignment and metadata.
2219 Builder.SetInsertPoint(LI);
2220 auto *ScalarLoad =
2221 Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
2222 ScalarLoad->setAlignment(LI->getAlign());
2223 ScalarLoad->copyMetadata(*LI);
2224
2225 // Replace all bitcast users with the scalar load.
2226 for (User *U : LI->users()) {
2227 auto *BC = cast<BitCastInst>(U);
2228 replaceValue(*BC, *ScalarLoad, false);
2229 }
2230
2231 return true;
2232}
2233
// Scalarize a fixed-vector zext whose users are all constant-index
// extractelement instructions: the source vector is bitcast to one integer
// and each extract becomes a logical shift right followed by a mask. Only
// applies when the whole source vector has as many bits as one destination
// element. Returns true if the IR was changed.
// NOTE(review): listing lines 2235, 2255, 2273-2274, 2276, 2278-2279 and 2306
// are missing from this copy: the condition guarding the first
// `return false`, the tail of the getCastInstrCost call, most of the
// ScalarCost expression, and part of the condition that decides on the
// freeze.
2234bool VectorCombine::scalarizeExtExtract(Instruction &I) {
2236 return false;
2237 auto *Ext = dyn_cast<ZExtInst>(&I);
2238 if (!Ext)
2239 return false;
2240
2241 // Try to convert a vector zext feeding only extracts to a set of scalar
2242 // (Src >> (ExtIdx * EltSize)) & EltMask
2243 // operations, if profitable.
2244 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2245 if (!SrcTy)
2246 return false;
2247 auto *DstTy = cast<FixedVectorType>(Ext->getType());
2248
2249 Type *ScalarDstTy = DstTy->getElementType();
2250 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2251 return false;
2252
2253 InstructionCost VectorCost =
2254 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2256 unsigned ExtCnt = 0;
2257 bool ExtLane0 = false;
  // Count the live extracts and note whether lane 0 is among them; extracts
  // without uses are not counted.
2258 for (User *U : Ext->users()) {
2259 uint64_t Idx;
2260 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2261 return false;
2262 if (cast<Instruction>(U)->use_empty())
2263 continue;
2264 ExtCnt += 1;
2265 ExtLane0 |= !Idx;
2266 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2267 CostKind, Idx, U);
2268 }
2269
  // One `and` per extract plus one `lshr` per extract other than lane 0.
2270 InstructionCost ScalarCost =
2271 ExtCnt * TTI.getArithmeticInstrCost(
2272 Instruction::And, ScalarDstTy, CostKind,
2275 (ExtCnt - ExtLane0) *
2277 Instruction::LShr, ScalarDstTy, CostKind,
2280 if (ScalarCost > VectorCost)
2281 return false;
2282
2283 Value *ScalarV = Ext->getOperand(0);
2284 if (!isGuaranteedNotToBePoison(ScalarV, SQ.AC, dyn_cast<Instruction>(ScalarV),
2285 SQ.DT)) {
2286 // Check whether all lanes are extracted, all extracts trigger UB
2287 // on poison, and the last extract (and hence all previous ones)
2288 // are guaranteed to execute if Ext executes. If so, we do not
2289 // need to insert a freeze.
2290 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2291 bool AllExtractsTriggerUB = true;
2292 ExtractElementInst *LastExtract = nullptr;
2293 BasicBlock *ExtBB = Ext->getParent();
2294 for (User *U : Ext->users()) {
2295 auto *Extract = cast<ExtractElementInst>(U);
2296 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2297 AllExtractsTriggerUB = false;
2298 break;
2299 }
2300 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2301 if (!LastExtract || LastExtract->comesBefore(Extract))
2302 LastExtract = Extract;
2303 }
2304 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2305 !AllExtractsTriggerUB ||
2307 LastExtract->getIterator()))
2308 ScalarV = Builder.CreateFreeze(ScalarV);
2309 }
  // View the whole source vector as a single integer of the same bit width.
2310 ScalarV = Builder.CreateBitCast(
2311 ScalarV,
2312 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2313 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2314 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2315 APInt EltBitMask = APInt::getLowBitsSet(TotalBits, SrcEltSizeInBits);
2316 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2317 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2318 for (User *U : Ext->users()) {
2319 auto *Extract = cast<ExtractElementInst>(U);
2320 uint64_t Idx =
2321 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
  // On big-endian targets the shift amount is counted from the other end
  // of the packed integer.
2322 uint64_t ShiftAmt =
2323 DL->isBigEndian()
2324 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2325 : (Idx * SrcEltSizeInBits);
2326 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2327 Value *And = Builder.CreateAnd(LShr, Mask);
2328 U->replaceAllUsesWith(And);
2329 }
2330 return true;
2331}
2332
2333/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2334/// to "(bitcast (concat X, Y))"
2335/// where X/Y are bitcasted from i1 mask vectors.
/// Only little-endian targets and scalar integer results are handled. The
/// fold is applied when the new sequence is not more expensive than the old
/// one. Returns true if the IR was changed.
// NOTE(review): listing lines 2347, 2356, 2364, 2401, 2403, 2406, 2409 and
// 2412 are missing from this copy: the pattern that binds X and Y, the middle
// of the two shl patterns, and parts of the cost computation.
2336bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2337 Type *Ty = I.getType();
2338 if (!Ty->isIntegerTy())
2339 return false;
2340
2341 // TODO: Add big endian test coverage
2342 if (DL->isBigEndian())
2343 return false;
2344
2345 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2346 Instruction *X, *Y;
2348 return false;
2349
2350 // Allow both sources to contain shl, to handle more generic pattern:
2351 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2352 Value *SrcX;
2353 uint64_t ShAmtX = 0;
2354 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2355 !match(X, m_OneUse(
2357 m_ConstantInt(ShAmtX)))))
2358 return false;
2359
2360 Value *SrcY;
2361 uint64_t ShAmtY = 0;
2362 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2363 !match(Y, m_OneUse(
2365 m_ConstantInt(ShAmtY)))))
2366 return false;
2367
2368 // Canonicalize larger shift to the RHS.
2369 if (ShAmtX > ShAmtY) {
2370 std::swap(X, Y);
2371 std::swap(SrcX, SrcY);
2372 std::swap(ShAmtX, ShAmtY);
2373 }
2374
2375 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2376 // difference is the mask width so they can be easily concatenated together.
2377 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
  // Number of shl instructions present in the original pattern (0, 1 or 2).
2378 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2379 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2380 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2381 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2382 !MaskTy->getElementType()->isIntegerTy(1) ||
2383 MaskTy->getNumElements() != ShAmtDiff ||
2384 MaskTy->getNumElements() > (BitWidth / 2))
2385 return false;
2386
2387 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2388 auto *ConcatIntTy =
2389 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2390 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2391
  // Shuffle mask 0, 1, ..., 2N-1 used to concatenate the two masks.
2392 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2393 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2394
2395 // TODO: Is it worth supporting multi use cases?
2396 InstructionCost OldCost = 0;
2397 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2398 OldCost +=
2399 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2400 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2402 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2404
2405 InstructionCost NewCost = 0;
2407 MaskTy, ConcatMask, CostKind);
2408 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2410 if (Ty != ConcatIntTy)
2411 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2413 if (ShAmtX > 0)
2414 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2415
2416 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2417 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2418 << "\n");
2419
2420 if (NewCost > OldCost)
2421 return false;
2422
2423 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2424 // any residual zero-extension or shifting.
2425 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2426 Worklist.pushValue(Concat);
2427
2428 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2429
2430 if (Ty != ConcatIntTy) {
2431 Worklist.pushValue(Result);
2432 Result = Builder.CreateZExt(Result, Ty);
2433 }
2434
2435 if (ShAmtX > 0) {
2436 Worklist.pushValue(Result);
2437 Result = Builder.CreateShl(Result, ShAmtX);
2438 }
2439
2440 replaceValue(I, *Result);
2441 return true;
2442}
2443
2444/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2445/// --> "binop (shuffle), (shuffle)".
/// At least one binop operand must itself be a shuffle; an operand that is
/// not a shuffle is treated as if it were an identity shuffle of itself. The
/// fold is applied when the new sequence is not more expensive than the old
/// one. Returns true if the IR was changed.
// NOTE(review): listing lines 2511, 2518, 2526, 2538 and 2542 are missing
// from this copy: the starts of the shuffle cost queries, including
// presumably the declarations of Shuf0Cost and Shuf1Cost, which are used
// below but never declared in the visible text.
2446bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2447 BinaryOperator *BinOp;
2448 ArrayRef<int> OuterMask;
2449 if (!match(&I, m_Shuffle(m_BinOp(BinOp), m_Undef(), m_Mask(OuterMask))))
2450 return false;
2451
2452 // Don't introduce poison into div/rem.
2453 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2454 return false;
2455
2456 Value *Op00, *Op01, *Op10, *Op11;
2457 ArrayRef<int> Mask0, Mask1;
2458 bool Match0 = match(BinOp->getOperand(0),
2459 m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)));
2460 bool Match1 = match(BinOp->getOperand(1),
2461 m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)));
2462 if (!Match0 && !Match1)
2463 return false;
2464
  // For an operand that is not a shuffle, use the operand itself as both
  // shuffle sources.
2465 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2466 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2467 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2468 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2469
2470 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2471 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2472 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2473 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2474 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2475 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2476 return false;
2477
2478 unsigned NumSrcElts = BinOpTy->getNumElements();
2479
2480 // Don't accept shuffles that reference the second operand in
2481 // div/rem or if it's an undef arg.
2482 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2483 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2484 return false;
2485
2486 // Merge outer / inner (or identity if no match) shuffles.
2487 SmallVector<int> NewMask0, NewMask1;
2488 for (int M : OuterMask) {
2489 if (M < 0 || M >= (int)NumSrcElts) {
2490 NewMask0.push_back(PoisonMaskElem);
2491 NewMask1.push_back(PoisonMaskElem);
2492 } else {
2493 NewMask0.push_back(Match0 ? Mask0[M] : M);
2494 NewMask1.push_back(Match1 ? Mask1[M] : M);
2495 }
2496 }
2497
  // A merged mask that is an identity over a same-typed first source needs
  // no shuffle at all.
2498 unsigned NumOpElts = Op0Ty->getNumElements();
2499 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2500 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2501 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2502 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2503 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2504 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2505
2506 InstructionCost NewCost = 0;
2507 // Try to merge shuffles across the binop if the new shuffles are not costly.
2508 InstructionCost BinOpCost =
2509 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind);
2510 InstructionCost OldCost =
2512 ShuffleDstTy, BinOpTy, OuterMask, CostKind,
2513 0, nullptr, {BinOp}, &I);
  // A binop with other users stays around, so it is charged to the new cost.
2514 if (!BinOp->hasOneUse())
2515 NewCost += BinOpCost;
2516
2517 if (Match0) {
2519 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2520 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2521 OldCost += Shuf0Cost;
2522 if (!BinOp->hasOneUse() || !BinOp->getOperand(0)->hasOneUse())
2523 NewCost += Shuf0Cost;
2524 }
2525 if (Match1) {
2527 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2528 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2529 OldCost += Shuf1Cost;
2530 if (!BinOp->hasOneUse() || !BinOp->getOperand(1)->hasOneUse())
2531 NewCost += Shuf1Cost;
2532 }
2533
2534 NewCost += TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2535
2536 if (!IsIdentity0)
2537 NewCost +=
2539 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2540 if (!IsIdentity1)
2541 NewCost +=
2543 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2544
2545 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2546 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2547 << "\n");
2548
2549 // If costs are equal, still fold as we reduce instruction count.
2550 if (NewCost > OldCost)
2551 return false;
2552
2553 Value *LHS =
2554 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2555 Value *RHS =
2556 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2557 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2558
2559 // Copy the IR flags from the old binop.
2560 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2561 NewInst->copyIRFlags(BinOp);
2562
2563 Worklist.pushValue(LHS);
2564 Worklist.pushValue(RHS);
2565 replaceValue(I, *NewBO);
2566 return true;
2567}
2568
2569/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2570/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
///
/// Both shuffle operands must be instructions with the same opcode (and, for
/// compares, the same predicate). The new operand shuffles start from the outer
/// mask; one-use unary shuffles feeding the binops/compares are merged into
/// them where possible. The rewrite is gated by the cost comparison near the
/// end of the function.
///
/// \param I The candidate outer shuffle.
/// \returns true if \p I was replaced by the new binop/compare, false
/// otherwise.
2571bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2572 ArrayRef<int> OldMask;
2573 Instruction *LHS, *RHS;
// NOTE(review): the opening of this match(...) condition is not present in
// this listing; only its tail, which binds the shuffle mask to OldMask, is
// visible.
2575 m_Mask(OldMask))))
2576 return false;
2577
2578 // TODO: Add support for addlike etc.
2579 if (LHS->getOpcode() != RHS->getOpcode())
2580 return false;
2581
// X/Y receive the operands of LHS, Z/W the operands of RHS.
2582 Value *X, *Y, *Z, *W;
2583 bool IsCommutative = false;
// The predicates keep this sentinel value unless the compare form matches
// below; PredLHS is later used to tell the binop case from the compare case.
2584 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2585 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2586 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2587 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2588 auto *BO = cast<BinaryOperator>(LHS);
2589 // Don't introduce poison into div/rem.
2590 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2591 return false;
2592 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2593 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2594 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2595 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2596 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2597 } else
2598 return false;
2599
// The shuffle result, the binop/compare result and the binop/compare operands
// must all be fixed-width vectors, and the first operands of both sides must
// have the same type.
2600 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2601 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2602 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2603 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2604 return false;
2605
// True when the shuffle reads the very same instruction on both sides.
2606 bool SameBinOp = LHS == RHS;
2607 unsigned NumSrcElts = BinOpTy->getNumElements();
2608
2609 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2610 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2611 std::swap(X, Y);
2612
// Remaps a mask index that selects from the second shuffle source onto the
// same lane of the first source. Used when both sources are the same value.
2613 auto ConvertToUnary = [NumSrcElts](int &M) {
2614 if (M >= (int)NumSrcElts)
2615 M -= NumSrcElts;
2616 };
2617
// Mask for the new shuffle of the first operands (X, Z).
// NOTE(review): the declaration of SK0 (used further down) and the statement
// that presumably updates it inside the branch below are not present in this
// listing.
2618 SmallVector<int> NewMask0(OldMask);
2620 TTI::OperandValueInfo Op0Info = TTI.commonOperandInfo(X, Z);
2621 if (X == Z) {
2622 llvm::for_each(NewMask0, ConvertToUnary);
2624 Z = PoisonValue::get(BinOpTy);
2625 }
2626
// Mask for the new shuffle of the second operands (Y, W).
// NOTE(review): as above, the SK1 declaration/update lines are not visible.
2627 SmallVector<int> NewMask1(OldMask);
2629 TTI::OperandValueInfo Op1Info = TTI.commonOperandInfo(Y, W);
2630 if (Y == W) {
2631 llvm::for_each(NewMask1, ConvertToUnary);
2633 W = PoisonValue::get(BinOpTy);
2634 }
2635
2636 // Try to replace a binop with a shuffle if the shuffle is not costly.
2637 // When SameBinOp, only count the binop cost once.
// NOTE(review): the declarations of LHSCost and RHSCost are not present in
// this listing.
2640
2641 InstructionCost OldCost = LHSCost;
2642 if (!SameBinOp) {
2643 OldCost += RHSCost;
2644 }
// NOTE(review): the start of the following statement is missing from this
// listing; presumably it adds the cost of the outer shuffle to OldCost.
2646 ShuffleDstTy, BinResTy, OldMask, CostKind, 0,
2647 nullptr, {LHS, RHS}, &I);
2648
2649 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2650 // where one use shuffles have gotten split across the binop/cmp. These
2651 // often allow a major reduction in total cost that wouldn't happen as
2652 // individual folds.
//
// Op is one operand of LHS/RHS, Offset is where that operand's lanes start in
// the two-source index space (0 for the first source, NumSrcElts for the
// second) and Mask is the new shuffle mask to update. If Op is a one-use
// shuffle with an undef second operand, of unchanged type, whose mask only
// references its first source, then every entry of Mask that selects from Op
// is redirected through the inner mask, Op is replaced by the inner operand
// and true is returned. Otherwise nothing changes and false is returned.
2653 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2654 TTI::TargetCostKind CostKind) -> bool {
2655 Value *InnerOp;
2656 ArrayRef<int> InnerMask;
2657 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2658 m_Mask(InnerMask)))) &&
2659 InnerOp->getType() == Op->getType() &&
2660 all_of(InnerMask,
2661 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2662 for (int &M : Mask)
2663 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2664 M = InnerMask[M - Offset];
// Negative (poison) inner entries are kept as-is; valid ones are moved back
// into this operand's index range.
2665 M = 0 <= M ? M + Offset : M;
2666 }
// NOTE(review): one line of the original is missing from this listing here.
2668 Op = InnerOp;
2669 return true;
2670 }
2671 return false;
2672 };
2673 bool ReducedInstCount = false;
2674 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2675 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2676 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2677 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
// When both new operand shuffles would be identical, a single shuffle can feed
// both operands of the new instruction.
2678 bool SingleSrcBinOp = (X == Y) && (Z == W) && (NewMask0 == NewMask1);
2679 // SingleSrcBinOp only reduces instruction count if we also eliminate the
2680 // original binop(s). If binops have multiple uses, they won't be eliminated.
2681 ReducedInstCount |= SingleSrcBinOp && LHS->hasOneUser() && RHS->hasOneUser();
2682
// Result type of the new operand shuffles: the operand element type combined
// with the outer shuffle's result shape.
2683 auto *ShuffleCmpTy =
2684 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
// NOTE(review): the start of the following statement (the declaration of
// NewCost) is missing from this listing.
2686 SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z});
2687 if (!SingleSrcBinOp)
2688 NewCost += TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1,
2689 CostKind, 0, nullptr, {Y, W});
2690
// PredLHS still holding the sentinel means the binop form matched above.
2691 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2692 NewCost += TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy,
2693 CostKind, Op0Info, Op1Info);
2694 } else {
2695 NewCost +=
2696 TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy, ShuffleDstTy,
2697 PredLHS, CostKind, Op0Info, Op1Info);
2698 }
2699 // If LHS/RHS have other uses, we need to account for the cost of keeping
2700 // the original instructions. When SameBinOp, only add the cost once.
2701 if (!LHS->hasOneUser())
2702 NewCost += LHSCost;
2703 if (!SameBinOp && !RHS->hasOneUser())
2704 NewCost += RHSCost;
2705
2706 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2707 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2708 << "\n");
2709
2710 // If either shuffle will constant fold away, then fold for the same cost as
2711 // we will reduce the instruction count.
2712 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2713 (isa<Constant>(Y) && isa<Constant>(W));
// Equal cost is accepted only when the instruction count is known to drop;
// otherwise a strict cost improvement is required.
2714 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2715 return false;
2716
// Emit the operand shuffle(s) and the new binop/compare on top of them.
2717 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2718 Value *Shuf1 =
2719 SingleSrcBinOp ? Shuf0 : Builder.CreateShuffleVector(Y, W, NewMask1);
2720 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2721 ? Builder.CreateBinOp(
2722 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2723 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2724
2725 // Intersect flags from the old binops.
2726 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2727 NewInst->copyIRFlags(LHS);
2728 NewInst->andIRFlags(RHS);
2729 }
2730
// Queue the new shuffles so later folds can revisit them, then replace I.
2731 Worklist.pushValue(Shuf0);
2732 Worklist.pushValue(Shuf1);
2733 replaceValue(I, *NewBO);
2734 return true;
2735}
2736
2737/// Try to convert,
2738/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2739/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
///
/// Only selects whose conditions are fixed-width vectors of the same type are
/// handled, and both selects must agree on fast-math flags. The rewrite is
/// skipped when the new sequence is costed higher than the old one.
///
/// \param I The candidate outer shuffle.
/// \returns true if \p I was replaced by the new select, false otherwise.
2740bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2741 ArrayRef<int> Mask;
2742 Value *C1, *T1, *F1, *C2, *T2, *F2;
2743 if (!match(&I, m_Shuffle(m_Select(m_Value(C1), m_Value(T1), m_Value(F1)),
2744 m_Select(m_Value(C2), m_Value(T2), m_Value(F2)),
2745 m_Mask(Mask))))
2746 return false;
2747
2748 auto *Sel1 = cast<Instruction>(I.getOperand(0));
2749 auto *Sel2 = cast<Instruction>(I.getOperand(1));
2750
// A scalar condition fails the FixedVectorType cast, so only selects with
// per-lane conditions of identical type get past this check.
2751 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2752 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2753 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2754 return false;
2755
2756 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2757 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2758 // SelectInsts must have the same FMF.
// Either both selects are FP math operators with equal flags or neither is.
2759 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2760 ((SI0FOp != nullptr) &&
2761 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2762 return false;
2763
2764 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2765 auto *DstVecTy = cast<FixedVectorType>(I.getType());
// NOTE(review): the declaration of SK (the shuffle kind used in the cost
// queries below) is not present in this listing.
2767 auto SelOp = Instruction::Select;
2768
// NOTE(review): the beginnings of the next two statements are missing from
// this listing; presumably they declare CostSel1 and CostSel2, which are used
// below as the costs of the two original selects.
2770 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2772 SelOp, SrcVecTy, C2VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2773
// Old sequence: both selects plus the outer shuffle.
2774 InstructionCost OldCost =
2775 CostSel1 + CostSel2 +
2776 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2777 {I.getOperand(0), I.getOperand(1)}, &I);
2778
// New sequence: one shuffle each for the conditions, the true values and the
// false values, plus one select on the shuffled vectors.
// NOTE(review): the start of the following statement (the declaration of
// NewCost) is missing from this listing.
2780 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2781 Mask, CostKind, 0, nullptr, {C1, C2});
2782 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2783 nullptr, {T1, T2});
2784 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2785 nullptr, {F1, F2});
2786 auto *C1C2ShuffledVecTy = FixedVectorType::get(
2787 Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements());
// NOTE(review): the remaining arguments of this call are missing from this
// listing.
2788 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2790
// A select with other users stays alive after the fold, so its cost is still
// part of the new sequence.
2791 if (!Sel1->hasOneUse())
2792 NewCost += CostSel1;
2793 if (!Sel2->hasOneUse())
2794 NewCost += CostSel2;
2795
2796 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2797 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2798 << "\n");
// Equal cost is accepted.
2799 if (NewCost > OldCost)
2800 return false;
2801
2802 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2803 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2804 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2805 Value *NewSel;
2806 // We presuppose that the SelectInsts have the same FMF.
2807 if (SI0FOp)
2808 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2809 SI0FOp->getFastMathFlags());
2810 else
2811 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2812
// Queue the new shuffles for further combining, then replace I.
2813 Worklist.pushValue(ShuffleCmp);
2814 Worklist.pushValue(ShuffleTrue);
2815 Worklist.pushValue(ShuffleFalse);
2816 replaceValue(I, *NewSel);
2817 return true;
2818}
2819
2820/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2821/// into "castop (shuffle)".
///
/// A shuffle of a single cast (second shuffle operand undef/poison) is handled
/// as well, except for bitcasts. For two casts, both must have the same source
/// type and either the same opcode or both be sext-like, in which case the new
/// cast is a sext. For bitcasts that change the element count, the shuffle mask
/// is rescaled to the cast's source element size.
///
/// \param I The candidate outer shuffle.
/// \returns true if \p I was replaced by the new cast, false otherwise.
2822bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2823 Value *V0, *V1;
2824 ArrayRef<int> OldMask;
2825 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2826 return false;
2827
2828 // Check whether this is a binary shuffle.
2829 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2830
// The first operand must always be a cast; the second only when it is used.
2831 auto *C0 = dyn_cast<CastInst>(V0);
2832 auto *C1 = dyn_cast<CastInst>(V1);
2833 if (!C0 || (IsBinaryShuffle && !C1))
2834 return false;
2835
2836 Instruction::CastOps Opcode = C0->getOpcode();
2837
2838 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2839 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2840 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2841 return false;
2842
2843 if (IsBinaryShuffle) {
2844 if (C0->getSrcTy() != C1->getSrcTy())
2845 return false;
2846 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2847 if (Opcode != C1->getOpcode()) {
2848 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2849 Opcode = Instruction::SExt;
2850 else
2851 return false;
2852 }
2853 }
2854
// Only fixed-width vector shuffles of fixed-width vector casts are handled.
2855 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2856 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2857 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2858 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2859 return false;
2860
2861 unsigned NumSrcElts = CastSrcTy->getNumElements();
2862 unsigned NumDstElts = CastDstTy->getNumElements();
2863 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2864 "Only bitcasts expected to alter src/dst element counts");
2865
2866 // Check for bitcasting of unscalable vector types.
2867 // e.g. <32 x i40> -> <40 x i32>
// Here "unscalable" means neither element count divides the other, so the
// mask cannot be rescaled by an integer factor.
2868 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2869 (NumDstElts % NumSrcElts) != 0)
2870 return false;
2871
// NewMask is OldMask re-expressed in units of the cast's source elements.
2872 SmallVector<int, 16> NewMask;
2873 if (NumSrcElts >= NumDstElts) {
2874 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2875 // always be expanded to the equivalent form choosing narrower elements.
2876 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2877 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2878 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2879 } else {
2880 // The bitcast is from narrow elements to wide elements. The shuffle mask
2881 // must choose consecutive elements to allow casting first.
2882 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2883 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2884 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2885 return false;
2886 }
2887
// Result type of the new shuffle: source element type, one lane per NewMask
// entry.
2888 auto *NewShuffleDstTy =
2889 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2890
2891 // Try to replace a castop with a shuffle if the shuffle is not costly.
// NOTE(review): the remaining arguments of this call are missing from this
// listing.
2892 InstructionCost CostC0 =
2893 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2895
// NOTE(review): the declaration of ShuffleKind and the two branch bodies that
// presumably select it (binary vs. unary shuffle) are missing from this
// listing.
2897 if (IsBinaryShuffle)
2899 else
2901
2902 InstructionCost OldCost = CostC0;
2903 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2904 CostKind, 0, nullptr, {}, &I);
2905
2906 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2907 CastSrcTy, NewMask, CostKind);
// NOTE(review): the remaining arguments of this call are missing from this
// listing.
2908 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
// A cast with other users stays alive, so it still counts towards the new
// cost.
2910 if (!C0->hasOneUse())
2911 NewCost += CostC0;
2912 if (IsBinaryShuffle) {
// NOTE(review): the remaining arguments of this call are missing from this
// listing.
2913 InstructionCost CostC1 =
2914 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2916 OldCost += CostC1;
2917 if (!C1->hasOneUse())
2918 NewCost += CostC1;
2919 }
2920
2921 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2922 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2923 << "\n");
// Equal cost is accepted.
2924 if (NewCost > OldCost)
2925 return false;
2926
// Shuffle the cast sources first, then apply a single cast to the result.
2927 Value *Shuf;
2928 if (IsBinaryShuffle)
2929 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2930 NewMask);
2931 else
2932 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2933
2934 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2935
2936 // Intersect flags from the old casts.
2937 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2938 NewInst->copyIRFlags(C0);
2939 if (IsBinaryShuffle)
2940 NewInst->andIRFlags(C1);
2941 }
2942
2943 Worklist.pushValue(Shuf);
2944 replaceValue(I, *Cast);
2945 return true;
2946}
2947
2948/// Try to convert any of:
2949/// "shuffle (shuffle x, y), (shuffle y, x)"
2950/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2951/// "shuffle (shuffle x, undef), y"
2952/// "shuffle x, (shuffle y, undef)"
2953/// into "shuffle x, y".
///
/// The outer mask is composed with the inner mask(s) so that every lane refers
/// directly to one of at most two underlying source values. If no source
/// remains, \p I is replaced by poison; if the composed mask is an identity,
/// \p I is replaced by the single source. Both of those replacements happen
/// without a cost check. Otherwise a single new shuffle is created when it is
/// not costed higher than the shuffles it replaces.
///
/// \param I The candidate outer shuffle.
/// \returns true if \p I was replaced, false otherwise.
2954bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2955 ArrayRef<int> OuterMask;
2956 Value *OuterV0, *OuterV1;
2957 if (!match(&I,
2958 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2959 return false;
2960
// At least one operand of the outer shuffle must itself be a shuffle.
2961 ArrayRef<int> InnerMask0, InnerMask1;
2962 Value *X0, *X1, *Y0, *Y1;
2963 bool Match0 =
2964 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2965 bool Match1 =
2966 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2967 if (!Match0 && !Match1)
2968 return false;
2969
2970 // If the outer shuffle is a permute, then create a fake inner all-poison
2971 // shuffle. This is easier than accounting for length-changing shuffles below.
2972 SmallVector<int, 16> PoisonMask1;
2973 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2974 X1 = X0;
2975 Y1 = Y0;
2976 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2977 InnerMask1 = PoisonMask1;
2978 Match1 = true; // fake match
2979 }
2980
// For an operand that is not a shuffle, treat the operand itself as both of
// its "sources".
2981 X0 = Match0 ? X0 : OuterV0;
2982 Y0 = Match0 ? Y0 : OuterV0;
2983 X1 = Match1 ? X1 : OuterV1;
2984 Y1 = Match1 ? Y1 : OuterV1;
// ShuffleSrcTy is the type of the underlying sources, ShuffleImmTy the type of
// the intermediate values feeding the outer shuffle.
2985 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2986 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2987 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2988 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2989 X0->getType() != X1->getType())
2990 return false;
2991
2992 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2993 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2994
2995 // Attempt to merge shuffles, matching up to 2 source operands.
2996 // Replace index to a poison arg with PoisonMaskElem.
2997 // Bail if either inner masks reference an undef arg.
2998 SmallVector<int, 16> NewMask(OuterMask);
2999 Value *NewX = nullptr, *NewY = nullptr;
3000 for (int &M : NewMask) {
// Src becomes the value this lane ultimately reads from and M the lane index
// within that value. Src stays null when the outer mask entry is negative.
3001 Value *Src = nullptr;
3002 if (0 <= M && M < (int)NumImmElts) {
// The lane reads from the first outer operand.
3003 Src = OuterV0;
3004 if (Match0) {
3005 M = InnerMask0[M];
3006 Src = M >= (int)NumSrcElts ? Y0 : X0;
3007 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
3008 }
3009 } else if (M >= (int)NumImmElts) {
// The lane reads from the second outer operand.
3010 Src = OuterV1;
3011 M -= NumImmElts;
3012 if (Match1) {
3013 M = InnerMask1[M];
3014 Src = M >= (int)NumSrcElts ? Y1 : X1;
3015 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
3016 }
3017 }
3018 if (Src && M != PoisonMaskElem) {
3019 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
3020 if (isa<UndefValue>(Src)) {
3021 // We've referenced an undef element - if it's poison, update the shuffle
3022 // mask, else bail.
3023 if (!isa<PoisonValue>(Src))
3024 return false;
3025 M = PoisonMaskElem;
3026 continue;
3027 }
// Assign the source to the first free (or matching) slot of the new shuffle.
3028 if (!NewX || NewX == Src) {
3029 NewX = Src;
3030 continue;
3031 }
3032 if (!NewY || NewY == Src) {
// Lanes of the second new operand live in the upper half of the index space.
3033 M += NumSrcElts;
3034 NewY = Src;
3035 continue;
3036 }
// A third distinct source would be needed; a single shuffle cannot express
// that.
3037 return false;
3038 }
3039 }
3040
// No lane reads from a real source, so the result is entirely poison.
3041 if (!NewX) {
3042 replaceValue(I, *PoisonValue::get(ShuffleDstTy));
3043 return true;
3044 }
3045
3046 if (!NewY)
3047 NewY = PoisonValue::get(ShuffleSrcTy);
3048
3049 // Have we folded to an Identity shuffle?
3050 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
3051 replaceValue(I, *NewX);
3052 return true;
3053 }
3054
3055 // Try to merge the shuffles if the new shuffle is not costly.
3056 InstructionCost InnerCost0 = 0;
3057 if (Match0)
3058 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
3059
3060 InstructionCost InnerCost1 = 0;
3061 if (Match1)
3062 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
3063
// NOTE(review): the declaration of OuterCost is not present in this listing.
3065
3066 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
3067
// IsUnary: no lane of the composed mask selects from the second new operand.
// NOTE(review): the declaration of SK, which presumably depends on IsUnary, is
// not present in this listing.
3068 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
3072 InstructionCost NewCost =
3073 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
3074 nullptr, {NewX, NewY});
// Inner shuffles with other users stay alive and still count towards the new
// cost.
3075 if (!OuterV0->hasOneUse())
3076 NewCost += InnerCost0;
3077 if (!OuterV1->hasOneUse())
3078 NewCost += InnerCost1;
3079
3080 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
3081 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3082 << "\n");
// Equal cost is accepted.
3083 if (NewCost > OldCost)
3084 return false;
3085
3086 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
3087 replaceValue(I, *Shuf);
3088 return true;
3089}
3090
3091/// Try to convert a chain of length-preserving shuffles that are fed by
3092/// length-changing shuffles from the same source, e.g. a chain of length 3:
3093///
3094/// "shuffle (shuffle (shuffle x, (shuffle y, undef)),
3095/// (shuffle y, undef)),
3096/// (shuffle y, undef)"
3097///
3098/// into a single shuffle fed by a length-changing shuffle:
3099///
3100/// "shuffle x, (shuffle y, undef)"
3101///
3102/// Such chains arise e.g. from folding extract/insert sequences.
///
/// The chain is walked from \p I towards its start. Each link must be a
/// length-preserving shuffle with exactly one "leaf" operand, i.e. a shuffle
/// whose source type differs from the type of \p I. All leaves must read from
/// the same value. The walk stops at the first link that does not fit, and the
/// fold is applied only if at least two links were collected.
///
/// \param I The last shuffle of the candidate chain.
/// \returns true if \p I was replaced, false otherwise.
3103bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
3104 FixedVectorType *TrunkType = dyn_cast<FixedVectorType>(I.getType());
3105 if (!TrunkType)
3106 return false;
3107
// State accumulated while walking the chain:
//   Mask    - combined mask of the final shuffle (trunk lanes first, leaf
//             lanes offset by NumTrunkElts),
//   YMask   - combined mask of the single merged leaf shuffle of Y,
//   Trunk   - the value at which the walk currently stands,
//   Y       - the common source of all leaves (null until one is seen).
3108 unsigned ChainLength = 0;
3109 SmallVector<int> Mask;
3110 SmallVector<int> YMask;
3111 InstructionCost OldCost = 0;
3112 InstructionCost NewCost = 0;
3113 Value *Trunk = &I;
3114 unsigned NumTrunkElts = TrunkType->getNumElements();
3115 Value *Y = nullptr;
3116
3117 for (;;) {
3118 // Match the current trunk against (commutations of) the pattern
3119 // "shuffle trunk', (shuffle y, undef)"
3120 ArrayRef<int> OuterMask;
3121 Value *OuterV0, *OuterV1;
// Interior links are folded away, so they must not have other users. The root
// (ChainLength == 0) is exempt.
3122 if (ChainLength != 0 && !Trunk->hasOneUse())
3123 break;
3124 if (!match(Trunk, m_Shuffle(m_Value(OuterV0), m_Value(OuterV1),
3125 m_Mask(OuterMask))))
3126 break;
3127 if (OuterV0->getType() != TrunkType) {
3128 // This shuffle is not length-preserving, so it cannot be part of the
3129 // chain.
3130 break;
3131 }
3132
3133 ArrayRef<int> InnerMask0, InnerMask1;
3134 Value *A0, *A1, *B0, *B1;
3135 bool Match0 =
3136 match(OuterV0, m_Shuffle(m_Value(A0), m_Value(B0), m_Mask(InnerMask0)));
3137 bool Match1 =
3138 match(OuterV1, m_Shuffle(m_Value(A1), m_Value(B1), m_Mask(InnerMask1)));
// A leaf is an operand shuffle whose source type differs from the type of I,
// i.e. a length-changing shuffle.
3139 bool Match0Leaf = Match0 && A0->getType() != I.getType();
3140 bool Match1Leaf = Match1 && A1->getType() != I.getType();
3141 if (Match0Leaf == Match1Leaf) {
3142 // Only handle the case of exactly one leaf in each step. The "two leaves"
3143 // case is handled by foldShuffleOfShuffles.
3144 break;
3145 }
3146
// Normalize so that the leaf is always the second operand (OuterV1 / A1 / B1 /
// InnerMask1); the outer mask is commuted to match.
3147 SmallVector<int> CommutedOuterMask;
3148 if (Match0Leaf) {
3149 std::swap(OuterV0, OuterV1);
3150 std::swap(InnerMask0, InnerMask1);
3151 std::swap(A0, A1);
3152 std::swap(B0, B1);
3153 llvm::append_range(CommutedOuterMask, OuterMask);
3154 for (int &M : CommutedOuterMask) {
3155 if (M == PoisonMaskElem)
3156 continue;
3157 if (M < (int)NumTrunkElts)
3158 M += NumTrunkElts;
3159 else
3160 M -= NumTrunkElts;
3161 }
3162 OuterMask = CommutedOuterMask;
3163 }
// The leaf is replaced by the merged leaf, so it must not have other users.
3164 if (!OuterV1->hasOneUse())
3165 break;
3166
// Every non-undef leaf operand must be the same value Y across the whole
// chain.
3167 if (!isa<UndefValue>(A1)) {
3168 if (!Y)
3169 Y = A1;
3170 else if (Y != A1)
3171 break;
3172 }
3173 if (!isa<UndefValue>(B1)) {
3174 if (!Y)
3175 Y = B1;
3176 else if (Y != B1)
3177 break;
3178 }
3179
// Rewrite the leaf mask as a unary mask over its source type: indices into the
// second leaf operand are folded onto the first.
3180 auto *YType = cast<FixedVectorType>(A1->getType());
3181 int NumLeafElts = YType->getNumElements();
3182 SmallVector<int> LocalYMask(InnerMask1);
3183 for (int &M : LocalYMask) {
3184 if (M >= NumLeafElts)
3185 M -= NumLeafElts;
3186 }
3187
// NOTE(review): the right-hand side of this initialization is missing from
// this listing; per its use below, LocalOldCost is the old cost attributed to
// this chain link.
3188 InstructionCost LocalOldCost =
3191
3192 // Handle the initial (start of chain) case.
3193 if (!ChainLength) {
3194 Mask.assign(OuterMask);
3195 YMask.assign(LocalYMask);
3196 OldCost = NewCost = LocalOldCost;
3197 Trunk = OuterV0;
3198 ChainLength++;
3199 continue;
3200 }
3201
3202 // For the non-root case, first attempt to combine masks.
// Lane by lane: an unset (-1) lane takes the other side's value, equal lanes
// agree, and two different defined lanes mean the leaves cannot be merged.
3203 SmallVector<int> NewYMask(YMask);
3204 bool Valid = true;
3205 for (auto [CombinedM, LeafM] : llvm::zip(NewYMask, LocalYMask)) {
3206 if (LeafM == -1 || CombinedM == LeafM)
3207 continue;
3208 if (CombinedM == -1) {
3209 CombinedM = LeafM;
3210 } else {
3211 Valid = false;
3212 break;
3213 }
3214 }
3215 if (!Valid)
3216 break;
3217
// Compose the accumulated mask with this link's outer mask: lanes that read
// the trunk are redirected through OuterMask, while poison lanes and lanes
// that already read the leaf are kept unchanged.
3218 SmallVector<int> NewMask;
3219 NewMask.reserve(NumTrunkElts);
3220 for (int M : Mask) {
3221 if (M < 0 || M >= static_cast<int>(NumTrunkElts))
3222 NewMask.push_back(M);
3223 else
3224 NewMask.push_back(OuterMask[M]);
3225 }
3226
3227 // Break the chain if adding this new step complicates the shuffles such
3228 // that it would increase the new cost by more than the old cost of this
3229 // step.
// NOTE(review): the beginnings of both cost terms below are missing from this
// listing; presumably they are the costs of the merged leaf shuffle (NewYMask)
// and of the merged trunk shuffle (NewMask).
3230 InstructionCost LocalNewCost =
3232 YType, NewYMask, CostKind) +
3234 TrunkType, NewMask, CostKind);
3235
3236 if (LocalNewCost >= NewCost && LocalOldCost < LocalNewCost - NewCost)
3237 break;
3238
3239 LLVM_DEBUG({
3240 if (ChainLength == 1) {
3241 dbgs() << "Found chain of shuffles fed by length-changing shuffles: "
3242 << I << '\n';
3243 }
3244 dbgs() << " next chain link: " << *Trunk << '\n'
3245 << " old cost: " << (OldCost + LocalOldCost)
3246 << " new cost: " << LocalNewCost << '\n';
3247 });
3248
// Accept this link and continue with the next one down the trunk.
3249 Mask = NewMask;
3250 YMask = NewYMask;
3251 OldCost += LocalOldCost;
3252 NewCost = LocalNewCost;
3253 Trunk = OuterV0;
3254 ChainLength++;
3255 }
// A chain of one link is already in the target form; nothing to fold.
3256 if (ChainLength <= 1)
3257 return false;
3258
3259 // Bail out if all leaves were poison.
3260 if (!Y)
3261 return false;
3262
3263 if (llvm::all_of(Mask, [&](int M) {
3264 return M < 0 || M >= static_cast<int>(NumTrunkElts);
3265 })) {
3266 // Produce a canonical simplified form if all elements are sourced from Y.
// Leaf lanes are translated through YMask so the result shuffles Y directly.
3267 for (int &M : Mask) {
3268 if (M >= static_cast<int>(NumTrunkElts))
3269 M = YMask[M - NumTrunkElts];
3270 }
3271 Value *Root =
3272 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), Mask);
3273 replaceValue(I, *Root);
3274 return true;
3275 }
3276
// General case: one merged leaf shuffle of Y feeding one shuffle with the
// remaining trunk.
3277 Value *Leaf =
3278 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), YMask);
3279 Value *Root = Builder.CreateShuffleVector(Trunk, Leaf, Mask);
3280 replaceValue(I, *Root);
3281 return true;
3282}
3283
3284/// Try to convert
3285/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
///
/// Both shuffle operands must be calls to the same trivially vectorizable
/// intrinsic with fixed-width vector results. Corresponding arguments must
/// either be identical or have the same type so they can be shuffled together.
/// The rewrite is skipped when the new sequence is costed higher than the old
/// one.
///
/// Note: the loop index named I inside the argument loops shadows the
/// Instruction parameter \p I for the duration of those loops.
///
/// \param I The candidate outer shuffle.
/// \returns true if \p I was replaced by the new intrinsic call, false
/// otherwise.
3286bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
3287 Value *V0, *V1;
3288 ArrayRef<int> OldMask;
3289 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
3290 return false;
3291
3292 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3293 auto *II1 = dyn_cast<IntrinsicInst>(V1);
3294 if (!II0 || !II1)
3295 return false;
3296
3297 Intrinsic::ID IID = II0->getIntrinsicID();
3298 if (IID != II1->getIntrinsicID())
3299 return false;
// Costs of the two existing intrinsic calls.
3300 InstructionCost CostII0 =
3301 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3302 InstructionCost CostII1 =
3303 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind);
3304
3305 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3306 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
3307 if (!ShuffleDstTy || !II0Ty)
3308 return false;
3309
3310 if (!isTriviallyVectorizable(IID))
3311 return false;
3312
// Check every argument pair for compatibility.
3313 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3314 Value *Arg0 = II0->getArgOperand(I);
3315 Value *Arg1 = II1->getArgOperand(I);
// NOTE(review): the `if (...) {` line that opens this branch is missing from
// this listing; per the comment below it selects scalar operands.
3317 // Scalar operands must be identical.
3318 if (Arg0 != Arg1)
3319 return false;
3320 } else if (Arg0->getType() != Arg1->getType()) {
3321 // The corresponding vector operands are shuffled together, so they must
3322 // share the same type. For intrinsics overloaded on their operand type
3323 // (e.g. llvm.fptosi.sat), two calls can produce the same result type
3324 // from different operand types; shuffling those would be invalid.
3325 return false;
3326 }
3327 }
3328
// Old sequence: both intrinsic calls plus the outer shuffle.
// NOTE(review): the start of the shuffle-cost call below is missing from this
// listing.
3329 InstructionCost OldCost =
3330 CostII0 + CostII1 +
3332 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
3333
// New sequence: one shuffle per distinct vector operand pair plus one
// intrinsic call on the shuffled operands.
3334 SmallVector<Type *> NewArgsTy;
3335 InstructionCost NewCost = 0;
// Operand pairs whose shuffle cost has already been added.
3336 SmallDenseSet<std::pair<Value *, Value *>> SeenOperandPairs;
3337 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
// NOTE(review): the `if (...) {` line that opens this branch is missing from
// this listing; this first branch keeps the argument type unchanged.
3339 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3340 } else {
// Shuffled operands keep their element type but take the lane count of the
// outer shuffle's result.
3341 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3342 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3343 ShuffleDstTy->getNumElements());
3344 NewArgsTy.push_back(ArgTy);
3345 std::pair<Value *, Value *> OperandPair =
3346 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3347 if (!SeenOperandPairs.insert(OperandPair).second) {
3348 // We've already computed the cost for this operand pair.
3349 continue;
3350 }
3351 NewCost += TTI.getShuffleCost(
3352 TargetTransformInfo::SK_PermuteTwoSrc, ArgTy, VecTy, OldMask,
3353 CostKind, 0, nullptr, {II0->getArgOperand(I), II1->getArgOperand(I)});
3354 }
3355 }
3356 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3357
3358 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
// Calls with other users stay alive and still count towards the new cost. If
// both operands are the same call, it is only counted once.
3359 if (!II0->hasOneUse())
3360 NewCost += CostII0;
3361 if (II1 != II0 && !II1->hasOneUse())
3362 NewCost += CostII1;
3363
3364 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
3365 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3366 << "\n");
3367
// Equal cost is accepted.
3368 if (NewCost > OldCost)
3369 return false;
3370
// Build the new argument list, creating each distinct operand shuffle once.
3371 SmallVector<Value *> NewArgs;
3372 SmallDenseMap<std::pair<Value *, Value *>, Value *> ShuffleCache;
3373 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
// NOTE(review): the `if (...) {` line that opens this branch is missing from
// this listing; this first branch passes the argument of II0 through
// unchanged.
3375 NewArgs.push_back(II0->getArgOperand(I));
3376 } else {
3377 std::pair<Value *, Value *> OperandPair =
3378 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3379 auto It = ShuffleCache.find(OperandPair);
3380 if (It != ShuffleCache.end()) {
3381 // Reuse previously created shuffle for this operand pair.
3382 NewArgs.push_back(It->second);
3383 continue;
3384 }
3385 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
3386 II1->getArgOperand(I), OldMask);
3387 ShuffleCache[OperandPair] = Shuf;
3388 NewArgs.push_back(Shuf);
3389 Worklist.pushValue(Shuf);
3390 }
3391 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3392
3393 // Intersect flags from the old intrinsics.
3394 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
3395 NewInst->copyIRFlags(II0);
3396 NewInst->andIRFlags(II1);
3397 }
3398
3399 replaceValue(I, *NewIntrinsic);
3400 return true;
3401}
3402
3403/// Try to convert
3404/// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
///
/// The shuffle must only read lanes of its first operand, which must be a call
/// to a trivially vectorizable intrinsic with a fixed-width vector result. The
/// rewrite is skipped when the new sequence is costed higher than the old one.
///
/// Note: the loop index named I inside the argument loops shadows the
/// Instruction parameter \p I for the duration of those loops.
///
/// \param I The candidate shuffle.
/// \returns true if \p I was replaced by the new intrinsic call, false
/// otherwise.
3405bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
3406 Value *V0;
3407 ArrayRef<int> Mask;
3408 if (!match(&I, m_Shuffle(m_Value(V0), m_Undef(), m_Mask(Mask))))
3409 return false;
3410
3411 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3412 if (!II0)
3413 return false;
3414
3415 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3416 auto *IntrinsicSrcTy = dyn_cast<FixedVectorType>(II0->getType());
3417 if (!ShuffleDstTy || !IntrinsicSrcTy)
3418 return false;
3419
3420 // Validate it's a pure permute, mask should only reference the first vector
3421 unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
3422 if (any_of(Mask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
3423 return false;
3424
3425 Intrinsic::ID IID = II0->getIntrinsicID();
3426 if (!isTriviallyVectorizable(IID))
3427 return false;
3428
3429 // Cost analysis
// NOTE(review): the start of the next statement is missing from this listing;
// presumably it declares IntrinsicCost, which is used below as the cost of the
// existing intrinsic call.
3431 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
// NOTE(review): most of the OldCost expression is missing from this listing;
// only the tail of a call taking the shuffle's mask and operand is visible.
3432 InstructionCost OldCost =
3435 IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
3436
// New sequence: one permute per shuffled operand plus one intrinsic call on
// the permuted operands.
3437 SmallVector<Type *> NewArgsTy;
3438 InstructionCost NewCost = 0;
3439 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
// NOTE(review): the `if (...) {` line that opens this branch is missing from
// this listing; this first branch keeps the argument type unchanged.
3441 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3442 } else {
// Shuffled operands keep their element type but take the lane count of the
// shuffle's result.
3443 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3444 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3445 ShuffleDstTy->getNumElements());
3446 NewArgsTy.push_back(ArgTy);
// NOTE(review): the start of the following statement is missing from this
// listing; presumably it adds the cost of permuting this operand to NewCost.
3448 ArgTy, VecTy, Mask, CostKind, 0, nullptr,
3449 {II0->getArgOperand(I)});
3450 }
3451 }
3452 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3453 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3454
3455 // If the intrinsic has multiple uses, we need to account for the cost of
3456 // keeping the original intrinsic around.
3457 if (!II0->hasOneUse())
3458 NewCost += IntrinsicCost;
3459
3460 LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n OldCost: "
3461 << OldCost << " vs NewCost: " << NewCost << "\n");
3462
// Equal cost is accepted.
3463 if (NewCost > OldCost)
3464 return false;
3465
3466 // Transform
3467 SmallVector<Value *> NewArgs;
3468 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
// NOTE(review): the `if (...) {` line that opens this branch is missing from
// this listing; this first branch passes the argument through unchanged.
3470 NewArgs.push_back(II0->getArgOperand(I));
3471 } else {
3472 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), Mask);
3473 NewArgs.push_back(Shuf);
3474 Worklist.pushValue(Shuf);
3475 }
3476 }
3477
3478 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3479
// Carry the IR flags of the original call over to the new one.
3480 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic))
3481 NewInst->copyIRFlags(II0);
3482
3483 replaceValue(I, *NewIntrinsic);
3484 return true;
3485}
3486
/// A (value, lane index) pair identifying one lane of a value. The helpers
/// below use {nullptr, PoisonMaskElem} for a lane that has no source.
3487using InstLane = std::pair<Value *, int>;
3488
// Follows lane `Lane` of `V` backwards through any chain of shufflevector
// instructions. Returns the first non-shuffle value reached, paired with the
// lane of that value selected by the masks along the way, or
// {nullptr, PoisonMaskElem} as soon as a negative mask element is encountered.
3489static InstLane lookThroughShuffles(Value *V, int Lane) {
3490 while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
     // Element count of the shuffle's first operand: mask values below it
     // select from operand 0, values at or above it select from operand 1.
3491 unsigned NumElts =
3492 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
3493 int M = SV->getMaskValue(Lane);
     // A negative mask element means this lane has no defined source.
3494 if (M < 0)
3495 return {nullptr, PoisonMaskElem};
3496 if (static_cast<unsigned>(M) < NumElts) {
3497 V = SV->getOperand(0);
3498 Lane = M;
3499 } else {
     // Rebase the index so it is relative to the second operand.
3500 V = SV->getOperand(1);
3501 Lane = M - NumElts;
3502 }
3503 }
3504 return InstLane{V, Lane};
3505}
3506
// NOTE(review): listing lines 3507-3509 (the function header and the
// declaration of `NItem`) are absent from this listing, so the signature is
// not visible here.
// For every (value, lane) entry of `Item`, takes operand `Op` of the entry's
// instruction, looks through shuffles from the entry's lane, and appends the
// result to `NItem`. Entries whose value is null are forwarded as
// {nullptr, PoisonMaskElem}.
3510 for (InstLane IL : Item) {
3511 auto [U, Lane] = IL;
3512 InstLane OpLane =
3513 U ? lookThroughShuffles(cast<Instruction>(U)->getOperand(Op), Lane)
3514 : InstLane{nullptr, PoisonMaskElem};
3515 NItem.emplace_back(OpLane);
3516 }
3517 return NItem;
3518}
3519
3520/// Detect concat of multiple values into a vector
/// Returns true when `Item` is made of consecutive slices, each slice being
/// all lanes (in order) of one value whose type equals the type of the first
/// entry's value, the number of slices is a power of two, and the target
/// reports a zero cost for concatenating two such vectors.
/// NOTE(review): listing line 3521 (the first line of the signature) is absent
/// from this listing.
3522 const TargetTransformInfo &TTI) {
3523 auto *Ty = cast<FixedVectorType>(Item.front().first->getType());
3524 unsigned NumElts = Ty->getNumElements();
     // Reject a single slice, single-element vectors, and lane counts that are
     // not a whole number of slices.
3525 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
3526 return false;
3527
3528 // Check that the concat is free, usually meaning that the type will be split
3529 // during legalization.
3530 SmallVector<int, 16> ConcatMask(NumElts * 2);
3531 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
3532 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
3533 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
3534 Ty, ConcatMask, CostKind) != 0)
3535 return false;
3536
3537 unsigned NumSlices = Item.size() / NumElts;
3538 // Currently we generate a tree of shuffles for the concats, which limits us
3539 // to a power2.
3540 if (!isPowerOf2_32(NumSlices))
3541 return false;
     // Every slice must be exactly lanes 0..NumElts-1 of a single value of
     // type Ty.
3542 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
3543 Value *SliceV = Item[Slice * NumElts].first;
3544 if (!SliceV || SliceV->getType() != Ty)
3545 return false;
3546 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
3547 auto [V, Lane] = Item[Slice * NumElts + Elt];
3548 if (Lane != static_cast<int>(Elt) || SliceV != V)
3549 return false;
3550 }
3551 }
3552 return true;
3553}
3554
// Recursively builds, through `Builder`, a replacement for the value described
// by `Item`. A front value recorded (together with `From`) in one of the leaf
// sets ends the recursion: identity leaves are returned as-is, splat leaves
// become a broadcast shuffle, and concat leaves become a tree of pairwise
// concatenating shuffles. Any other front value is treated as an instruction
// and re-created with the same opcode/predicate/intrinsic ID from `Ops`, with
// IR flags propagated from all non-null lane values of `Item`.
// NOTE(review): listing lines 3556 (first line of the parameter list), 3593
// (presumably the declaration of `Ops`) and 3600 (start of the statement that
// fills `Ops[Idx]`) are absent from this listing.
3555static Value *
3557 const DenseSet<std::pair<Value *, Use *>> &IdentityLeafs,
3558 const DenseSet<std::pair<Value *, Use *>> &SplatLeafs,
3559 const DenseSet<std::pair<Value *, Use *>> &ConcatLeafs,
3560 IRBuilderBase &Builder, const TargetTransformInfo *TTI) {
3561 auto [FrontV, FrontLane] = Item.front();
3562
     // Identity leaf: the existing value is reused unchanged.
3563 if (IdentityLeafs.contains(std::make_pair(FrontV, From))) {
3564 return FrontV;
3565 }
     // Splat leaf: broadcast lane FrontLane of FrontV across all lanes of Ty.
3566 if (SplatLeafs.contains(std::make_pair(FrontV, From))) {
3567 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
3568 return Builder.CreateShuffleVector(FrontV, Mask);
3569 }
     // Concat leaf: take the first value of every slice and join neighbours
     // pairwise, doubling the vector width each round until one value remains.
3570 if (ConcatLeafs.contains(std::make_pair(FrontV, From))) {
3571 unsigned NumElts =
3572 cast<FixedVectorType>(FrontV->getType())->getNumElements();
3573 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
3574 for (unsigned S = 0; S < Values.size(); ++S)
3575 Values[S] = Item[S * NumElts].first;
3576
3577 while (Values.size() > 1) {
3578 NumElts *= 2;
3579 SmallVector<int, 16> Mask(NumElts, 0);
3580 std::iota(Mask.begin(), Mask.end(), 0);
3581 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
3582 for (unsigned S = 0; S < NewValues.size(); ++S)
3583 NewValues[S] =
3584 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
3585 Values = NewValues;
3586 }
3587 return Values[0];
3588 }
3589
3590 auto *I = cast<Instruction>(FrontV);
3591 auto *II = dyn_cast<IntrinsicInst>(I);
     // For intrinsic calls one trailing operand is excluded (presumably the
     // callee operand of the call -- TODO confirm).
3592 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
3594 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
     // Operands that the intrinsic takes as scalars are forwarded unchanged.
3595 if (II &&
3596 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
3597 Ops[Idx] = II->getOperand(Idx);
3598 continue;
3599 }
3601 &I->getOperandUse(Idx), Ty, IdentityLeafs,
3602 SplatLeafs, ConcatLeafs, Builder, TTI);
3603 }
3604
     // Collect every non-null lane value; these are the inputs to
     // propagateIRFlags for the newly created instruction.
3605 SmallVector<Value *, 8> ValueList;
3606 for (const auto &Lane : Item)
3607 if (Lane.first)
3608 ValueList.push_back(Lane.first);
3609
     // Result type: I's scalar type widened to the lane count of Ty. Only the
     // cast and intrinsic cases below pass it explicitly.
3610 Type *DstTy =
3611 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
3612 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
3613 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
3614 Ops[0], Ops[1]);
3615 propagateIRFlags(Value, ValueList);
3616 return Value;
3617 }
3618 if (auto *CI = dyn_cast<CmpInst>(I)) {
3619 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3620 propagateIRFlags(Value, ValueList);
3621 return Value;
3622 }
3623 if (auto *SI = dyn_cast<SelectInst>(I)) {
3624 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3625 propagateIRFlags(Value, ValueList);
3626 return Value;
3627 }
3628 if (auto *CI = dyn_cast<CastInst>(I)) {
3629 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3630 propagateIRFlags(Value, ValueList);
3631 return Value;
3632 }
3633 if (II) {
3634 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3635 propagateIRFlags(Value, ValueList);
3636 return Value;
3637 }
     // Every other instruction kind reaching this point must be unary.
3638 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3639 auto *Value =
3640 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3641 propagateIRFlags(Value, ValueList);
3642 return Value;
3643}
3644
3645// Starting from a shuffle, look up through operands tracking the shuffled index
3646// of each lane. If we can simplify away the shuffles to identities then
3647// do so.
// Returns true if `I` was replaced by a newly generated instruction tree, and
// false if the pattern was not recognised or the scan limit was exceeded.
// NOTE(review): listing lines 3657 (presumably the declaration of `Worklist`),
// 3678 (second half of the `IsEquiv` lambda body), and 3754, 3756, 3761, 3770,
// 3775, 3777, 3779, 3797 (the opening lines of the statements that consume the
// `getOperandUse(...)` arguments below) are absent from this listing.
3648bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3649 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3650 if (!Ty || I.use_empty())
3651 return false;
3652
     // Resolve every lane of I to the (value, lane) it reads from once all
     // shuffles have been looked through.
3653 SmallVector<InstLane> Start(Ty->getNumElements());
3654 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3655 Start[M] = lookThroughShuffles(&I, M);
3656
3658 Worklist.push_back(std::make_pair(Start, &*I.use_begin()));
3659 DenseSet<std::pair<Value *, Use *>> IdentityLeafs, SplatLeafs, ConcatLeafs;
3660 unsigned NumVisited = 0;
3661
3662 while (!Worklist.empty()) {
     // Bound the number of items examined per candidate.
3663 if (++NumVisited > MaxInstrsToScan)
3664 return false;
3665
3666 auto ItemFrom = Worklist.pop_back_val();
3667 auto Item = ItemFrom.first;
3668 auto From = ItemFrom.second;
3669 auto [FrontV, FrontLane] = Item.front();
3670
3671 // If we found an undef first lane then bail out to keep things simple.
3672 if (!FrontV)
3673 return false;
3674
3675 // Helper to peek through bitcasts to the same value.
3676 auto IsEquiv = [&](Value *X, Value *Y) {
3677 return X->getType() == Y->getType() &&
3679 };
3680
3681 // Look for an identity value.
     // That is: the front lane is lane 0 of a value with as many elements as
     // Ty, and every other lane is either null or lane <index> of an
     // equivalent value.
3682 if (FrontLane == 0 &&
3683 cast<FixedVectorType>(FrontV->getType())->getNumElements() ==
3684 Ty->getNumElements() &&
3685 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3686 Value *FrontV = Item.front().first;
3687 return !E.value().first || (IsEquiv(E.value().first, FrontV) &&
3688 E.value().second == (int)E.index());
3689 })) {
3690 IdentityLeafs.insert(std::make_pair(FrontV, From));
3691 continue;
3692 }
3693 // Look for constants, for the moment only supporting constant splats.
3694 if (auto *C = dyn_cast<Constant>(FrontV);
3695 C && C->getSplatValue() &&
3696 all_of(drop_begin(Item), [Item](InstLane &IL) {
3697 Value *FrontV = Item.front().first;
3698 Value *V = IL.first;
3699 return !V || (isa<Constant>(V) &&
3700 cast<Constant>(V)->getSplatValue() ==
3701 cast<Constant>(FrontV)->getSplatValue());
3702 })) {
3703 SplatLeafs.insert(std::make_pair(FrontV, From));
3704 continue;
3705 }
3706 // Look for a splat value.
     // Every non-null lane must read the same lane of the same value as the
     // front lane.
3707 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3708 auto [FrontV, FrontLane] = Item.front();
3709 auto [V, Lane] = IL;
3710 return !V || (V == FrontV && Lane == FrontLane);
3711 })) {
3712 SplatLeafs.insert(std::make_pair(FrontV, From));
3713 continue;
3714 }
3715
3716 // We need each element to be the same type of value, and check that each
3717 // element has a single use.
3718 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3719 Value *FrontV = Item.front().first;
3720 if (!IL.first)
3721 return true;
3722 Value *V = IL.first;
3723 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3724 return false;
3725 if (V->getValueID() != FrontV->getValueID())
3726 return false;
3727 if (auto *CI = dyn_cast<CmpInst>(V))
3728 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3729 return false;
3730 if (auto *CI = dyn_cast<CastInst>(V))
3731 if (CI->getSrcTy()->getScalarType() !=
3732 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3733 return false;
     // Selects must have a vector condition of the same type as the front
     // select's condition.
3734 if (auto *SI = dyn_cast<SelectInst>(V))
3735 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3736 SI->getOperand(0)->getType() !=
3737 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3738 return false;
     // Calls are only accepted when they are intrinsics with the same ID as
     // the front value and carry no operand bundles.
3739 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3740 return false;
3741 auto *II = dyn_cast<IntrinsicInst>(V);
3742 return !II || (isa<IntrinsicInst>(FrontV) &&
3743 II->getIntrinsicID() ==
3744 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3745 !II->hasOperandBundles());
3746 };
3747 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3748 // Check the operator is one that we support.
3749 if (isa<BinaryOperator, CmpInst>(FrontV)) {
3750 // We exclude div/rem in case they hit UB from poison lanes.
3751 if (auto *BO = dyn_cast<BinaryOperator>(FrontV);
3752 BO && BO->isIntDivRem())
3753 return false;
3755 &cast<Instruction>(FrontV)->getOperandUse(0));
3757 &cast<Instruction>(FrontV)->getOperandUse(1));
3758 continue;
3759 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3760 FPToUIInst, SIToFPInst, UIToFPInst>(FrontV)) {
3762 &cast<Instruction>(FrontV)->getOperandUse(0));
3763 continue;
3764 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontV)) {
3765 // TODO: Handle vector widening/narrowing bitcasts.
3766 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3767 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3768 if (DstTy && SrcTy &&
3769 SrcTy->getNumElements() == DstTy->getNumElements()) {
3771 &BitCast->getOperandUse(0));
3772 continue;
3773 }
3774 } else if (auto *Sel = dyn_cast<SelectInst>(FrontV)) {
3776 &Sel->getOperandUse(0));
3778 &Sel->getOperandUse(1));
3780 &Sel->getOperandUse(2));
3781 continue;
3782 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontV);
3783 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3784 !II->hasOperandBundles()) {
3785 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
     // An operand the intrinsic takes as a scalar must be the very same
     // value on every lane; it is not traversed further.
3786 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3787 &TTI)) {
3788 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3789 Value *FrontV = Item.front().first;
3790 Value *V = IL.first;
3791 return !V || (cast<Instruction>(V)->getOperand(Op) ==
3792 cast<Instruction>(FrontV)->getOperand(Op));
3793 }))
3794 return false;
3795 continue;
3796 }
3798 &cast<Instruction>(FrontV)->getOperandUse(Op));
3799 }
3800 continue;
3801 }
3802 }
3803
     // Last resort: accept the item as a leaf if it is a free concatenation.
3804 if (isFreeConcat(Item, CostKind, TTI)) {
3805 ConcatLeafs.insert(std::make_pair(FrontV, From));
3806 continue;
3807 }
3808
3809 return false;
3810 }
3811
     // Only the starting item was examined, so there is no instruction tree to
     // rebuild.
3812 if (NumVisited <= 1)
3813 return false;
3814
3815 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3816
3817 // If we got this far, we know the shuffles are superfluous and can be
3818 // removed. Scan through again and generate the new tree of instructions.
3819 Builder.SetInsertPoint(&I);
3820 Value *V = generateNewInstTree(Start, &*I.use_begin(), Ty, IdentityLeafs,
3821 SplatLeafs, ConcatLeafs, Builder, &TTI);
3822 replaceValue(I, *V);
3823 return true;
3824}
3825
3826/// Given a commutative reduction, the order of the input lanes does not alter
3827/// the results. We can use this to remove certain shuffles feeding the
3828/// reduction, removing the need to shuffle at all.
/// Returns true if the shuffle was replaced by one with a sorted mask, or if
/// the fallback call to foldSelectShuffle reported a change.
/// NOTE(review): listing lines 3902 (initialiser of `ShuffleInputType`), 3915
/// and 3918 (presumably the declarations of `OldCost` and `NewCost`) are
/// absent from this listing.
3829bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3830 auto *II = dyn_cast<IntrinsicInst>(&I);
3831 if (!II)
3832 return false;
     // Only these integer reductions are handled.
3833 switch (II->getIntrinsicID()) {
3834 case Intrinsic::vector_reduce_add:
3835 case Intrinsic::vector_reduce_mul:
3836 case Intrinsic::vector_reduce_and:
3837 case Intrinsic::vector_reduce_or:
3838 case Intrinsic::vector_reduce_xor:
3839 case Intrinsic::vector_reduce_smin:
3840 case Intrinsic::vector_reduce_smax:
3841 case Intrinsic::vector_reduce_umin:
3842 case Intrinsic::vector_reduce_umax:
3843 break;
3844 default:
3845 return false;
3846 }
3847
3848 // Find all the inputs when looking through operations that do not alter the
3849 // lane order (binops, for example). Currently we look for a single shuffle,
3850 // and can ignore splat values.
3851 std::queue<Value *> Worklist;
3852 SmallPtrSet<Value *, 4> Visited;
3853 ShuffleVectorInst *Shuffle = nullptr;
3854 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3855 Worklist.push(Op);
3856
3857 while (!Worklist.empty()) {
3858 Value *CV = Worklist.front();
3859 Worklist.pop();
3860 if (Visited.contains(CV))
3861 continue;
3862
3863 // Splats don't change the order, so can be safely ignored.
3864 if (isSplatValue(CV))
3865 continue;
3866
3867 Visited.insert(CV);
3868
3869 if (auto *CI = dyn_cast<Instruction>(CV)) {
3870 if (CI->isBinaryOp()) {
3871 for (auto *Op : CI->operand_values())
3872 Worklist.push(Op);
3873 continue;
3874 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
     // At most one distinct shuffle may feed the reduction.
3875 if (Shuffle && Shuffle != SV)
3876 return false;
3877 Shuffle = SV;
3878 continue;
3879 }
3880 }
3881
3882 // Anything else is currently an unknown node.
3883 return false;
3884 }
3885
3886 if (!Shuffle)
3887 return false;
3888
3889 // Check all uses of the binary ops and shuffles are also included in the
3890 // lane-invariant operations (Visited should be the list of lanewise
3891 // instructions, including the shuffle that we found).
3892 for (auto *V : Visited)
3893 for (auto *U : V->users())
3894 if (!Visited.contains(U) && U != &I)
3895 return false;
3896
3897 FixedVectorType *VecType =
3898 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3899 if (!VecType)
3900 return false;
3901 FixedVectorType *ShuffleInputType =
3903 if (!ShuffleInputType)
3904 return false;
3905 unsigned NumInputElts = ShuffleInputType->getNumElements();
3906
3907 // Find the mask from sorting the lanes into order. This is most likely to
3908 // become an identity or concat mask. Undef elements are pushed to the end.
     // The comparison is done on unsigned values, so negative mask elements
     // sort after every real index.
3909 SmallVector<int> ConcatMask;
3910 Shuffle->getShuffleMask(ConcatMask);
3911 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3912 bool UsesSecondVec =
3913 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3914
3916 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3917 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3919 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3920 ShuffleInputType, ConcatMask, CostKind);
3921
3922 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3923 << "\n");
3924 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3925 << "\n");
3926 bool MadeChanges = false;
     // Replace the shuffle only when the sorted mask is strictly cheaper.
3927 if (NewCost < OldCost) {
3928 Builder.SetInsertPoint(Shuffle);
3929 Value *NewShuffle = Builder.CreateShuffleVector(
3930 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3931 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3932 replaceValue(*Shuffle, *NewShuffle);
3933 return true;
3934 }
3935
3936 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3937 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3938 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3939 return MadeChanges;
3940}
3941
3942/// Try to fold a chain of shuffles and ops feeding extractelement(..., 0)
3943/// into llvm.vector.reduce.*, by tracking which lanes contribute to the
3944/// extracted lane and reducing the widest vector whose lanes each contribute
3945/// once.
3946///
3947/// For example:
3948///
3949/// %lo = shufflevector <4 x i32> %a, poison, <2 x i32> <i32 0, i32 1>
3950/// %hi = shufflevector <4 x i32> %a, poison, <2 x i32> <i32 2, i32 3>
3951/// %s = add <2 x i32> %lo, %hi
3952/// %sh = shufflevector <2 x i32> %s, poison, <2 x i32> <i32 1, i32 poison>
3953/// %r = add <2 x i32> %s, %sh
3954/// %e = extractelement <2 x i32> %r, i64 0
3955///
3956/// transforms to:
3957///
3958/// %e = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
///
/// Returns true if `I` was replaced by a reduction intrinsic call.
/// NOTE(review): listing lines 4139 (the condition guarding the first
/// `continue` of the intermediate-node loop), 4164 (body of the loop that
/// accumulates `OrigCost`), 4181-4182 (the two alternatives that initialise
/// `SK`) and 4210 (presumably the start of the declaration of `Identity`) are
/// absent from this listing.
3959bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3960 Value *VecOpEE;
3961 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3962 return false;
3963
3964 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3965 if (!FVT)
3966 return false;
3967
3968 if (FVT->getNumElements() < 2)
3969 return false;
3970
     // Exactly one of these is set, depending on whether the chain root is a
     // binary operator or a min/max intrinsic.
3971 std::optional<Instruction::BinaryOps> CommonBinOp;
3972 std::optional<Intrinsic::ID> CommonCallOp;
3973
3974 if (auto *BO = dyn_cast<BinaryOperator>(VecOpEE)) {
3975 if (!getReductionForBinop(BO->getOpcode()))
3976 return false;
3977 CommonBinOp = BO->getOpcode();
3978 } else if (auto *MMI = dyn_cast<MinMaxIntrinsic>(VecOpEE)) {
3979 CommonCallOp = MMI->getIntrinsicID();
3980 } else {
3981 return false;
3982 }
3983
3984 // For floating-point reductions, track FMF intersection across all binops.
3985 FastMathFlags CommonFMF;
3986 bool IsFloatReduction = false;
3987
3988 // A chain node is one we walk through, either a matching-opcode binop/min-max
3989 // or a single-source shuffle. Anything else is a leaf source.
3990 auto IsChainNode = [&](Value *V) {
3991 if (auto *BO = dyn_cast<BinaryOperator>(V))
3992 return CommonBinOp && BO->getOpcode() == *CommonBinOp;
3993 if (auto *MMI = dyn_cast<MinMaxIntrinsic>(V))
3994 return CommonCallOp && MMI->getIntrinsicID() == *CommonCallOp;
3995 if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
3996 return isa<PoisonValue>(SVI->getOperand(1));
3997 return false;
3998 };
3999
4000 // Collect the chain, building Nodes in postorder. Bail if the chain is empty
4001 // or exceeds MaxChainNodes.
4002 constexpr unsigned MaxChainNodes = 32;
4003 SmallSetVector<Value *, 16> Nodes;
4004 SmallSetVector<Value *, 4> Sources;
4005 unsigned NumVisited = 0;
     // Records V as a leaf source; only fixed-width vector values qualify.
4006 auto AddSource = [&](Value *V) {
4007 if (!isa<FixedVectorType>(V->getType()))
4008 return false;
4009 Sources.insert(V);
4010 return true;
4011 };
     // Recursive walk; the lambda receives itself as its second argument so it
     // can recurse.
4012 auto Walk = [&](Value *V, auto &&Walk) -> bool {
4013 if (Nodes.contains(V) || Sources.contains(V))
4014 return true;
4015 if (++NumVisited > MaxChainNodes)
4016 return false;
4017 if (!IsChainNode(V))
4018 return AddSource(V);
4019 // Chain shuffles always have poison as op1, so only op0 matters.
4020 auto *U = cast<Instruction>(V);
4021 unsigned NumOps = isa<ShuffleVectorInst>(U) ? 1 : 2;
4022 for (unsigned I = 0; I != NumOps; ++I)
4023 if (!Walk(U->getOperand(I), Walk))
4024 return false;
4025 if (isa<ShuffleVectorInst>(U) || Nodes.contains(U->getOperand(0)) ||
4026 Nodes.contains(U->getOperand(1))) {
4027 Nodes.insert(V);
4028 return true;
4029 }
4030 // Both operands are leaves so treat this binop as a source rather than
4031 // walking into it.
4032 return AddSource(V);
4033 };
4034 if (!Walk(VecOpEE, Walk) || Nodes.empty())
4035 return false;
4036
     // For idempotent operations a lane may contribute more than once, so the
     // duplicate-lane checks below are skipped for them.
4037 bool IsIdempotent =
4038 CommonCallOp || (CommonBinOp && Instruction::isIdempotent(*CommonBinOp));
4039
4040 // For FP reductions, require reassoc on every binop and collect FMF.
4041 for (Value *V : Nodes) {
4042 auto *BinOp = dyn_cast<BinaryOperator>(V);
4043 if (!BinOp || !BinOp->getType()->isFPOrFPVectorTy())
4044 continue;
4045 if (!BinOp->hasAllowReassoc())
4046 return false;
4047 if (!IsFloatReduction) {
4048 CommonFMF = BinOp->getFastMathFlags();
4049 IsFloatReduction = true;
4050 } else {
4051 CommonFMF &= BinOp->getFastMathFlags();
4052 }
4053 }
4054
4055 // Top-down demanded elements. For each chain value, track which lanes feed
4056 // the extracted lane 0 and which feed it more than once. Reverse postorder
4057 // visits every use before its value. A binop forwards its demand to both
4058 // operands and a shuffle follows its mask back to the source lane.
4059 struct Demand {
4060 APInt Lanes;
4061 APInt Duplicates;
4062 };
4063 DenseMap<Value *, Demand> Demands;
     // Returns the Demand entry for V, (re)initialising both bit sets to zero
     // at V's lane count when their width does not match it.
4064 auto DemandOf = [&](Value *V) -> Demand & {
4065 unsigned N = cast<FixedVectorType>(V->getType())->getNumElements();
4066 Demand &D = Demands[V];
4067 if (D.Lanes.getBitWidth() != N)
4068 D.Lanes = D.Duplicates = APInt::getZero(N);
4069 return D;
4070 };
     // Seed: only lane 0 of the chain root is demanded by the extract.
4071 DemandOf(VecOpEE).Lanes.setBit(0);
4072 for (Value *V : reverse(Nodes)) {
4073 Demand DV = Demands.lookup(V);
4074 if (DV.Lanes.isZero())
4075 continue;
4076 if (auto *SVI = dyn_cast<ShuffleVectorInst>(V)) {
4077 ArrayRef<int> Mask = SVI->getShuffleMask();
4078 Demand &DS = DemandOf(SVI->getOperand(0));
4079 for (unsigned I = 0, E = Mask.size(); I != E; ++I) {
4080 // Skip lanes that are undemanded or map to poison.
4081 if (!DV.Lanes[I] || Mask[I] < 0 ||
4082 (unsigned)Mask[I] >= DS.Lanes.getBitWidth())
4083 continue;
     // A source lane is a duplicate if it was already demanded or if the
     // shuffle lane reading it was itself a duplicate.
4084 if (DS.Lanes[Mask[I]] || DV.Duplicates[I])
4085 DS.Duplicates.setBit(Mask[I]);
4086 DS.Lanes.setBit(Mask[I]);
4087 }
4088 } else {
4089 auto *U = cast<User>(V);
4090 for (Value *Op : {U->getOperand(0), U->getOperand(1)}) {
4091 Demand &DOp = DemandOf(Op);
4092 // Lanes demanded through more than one path accumulate in Duplicates.
4093 DOp.Duplicates |= DV.Duplicates | (DOp.Lanes & DV.Lanes);
4094 DOp.Lanes |= DV.Lanes;
4095 }
4096 }
4097 }
4098
4099 // Reducing V replaces the entire chain, so every contribution to the result
4100 // must flow through V. Reject if anything above V reads outside the chain.
4101 auto CoversChain = [&](Value *V) {
4102 SmallVector<Value *, 8> Worklist(1, VecOpEE);
4103 SmallPtrSet<Value *, 8> Seen;
4104 Seen.insert(VecOpEE);
4105 while (!Worklist.empty()) {
4106 auto *U = cast<Instruction>(Worklist.pop_back_val());
4107 unsigned NumOps = isa<ShuffleVectorInst>(U) ? 1 : 2;
4108 for (unsigned I = 0; I != NumOps; ++I) {
4109 Value *Op = U->getOperand(I);
4110 if (Op == V || !Seen.insert(Op).second)
4111 continue;
4112 if (!Nodes.contains(Op))
4113 return false;
4114 Worklist.push_back(Op);
4115 }
4116 }
4117 return true;
4118 };
4119
4120 // Reduce a single cleanly demanded source if there is one, otherwise the
4121 // deepest intermediate that covers the chain.
4122 struct ReductionCut {
4123 Value *Src;
4124 APInt Elts;
4125 };
4126 std::optional<ReductionCut> Cut;
4127 for (Value *S : Sources) {
4128 auto It = Demands.find(S);
4129 if (It == Demands.end() || It->second.Lanes.isZero())
4130 continue;
     // A second demanded source, or duplicated lanes under a non-idempotent
     // operation, rules out reducing a source directly.
4131 if (Cut || (!IsIdempotent && !It->second.Duplicates.isZero())) {
4132 Cut.reset();
4133 break;
4134 }
4135 Cut = ReductionCut{S, It->second.Lanes};
4136 }
4137 if (!Cut) {
4138 for (Value *V : Nodes) {
4140 continue;
4141 auto It = Demands.find(V);
4142 if (It == Demands.end() || !It->second.Lanes.isAllOnes())
4143 continue;
4144 if (!IsIdempotent && !It->second.Duplicates.isZero())
4145 continue;
4146 if (!CoversChain(V))
4147 continue;
4148 Cut = ReductionCut{V, It->second.Lanes};
4149 break;
4150 }
4151 }
4152 // Reducing one lane is just an extract and can refold forever.
4153 if (!Cut || Cut->Elts.popcount() < 2)
4154 return false;
4155
4156 Intrinsic::ID ReducedOp =
4157 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
4158 : getReductionForBinop(*CommonBinOp));
4159 if (!ReducedOp)
4160 return false;
4161
4162 InstructionCost OrigCost = 0;
4163 for (Value *V : Nodes)
4165
     // When only some lanes of the cut value are demanded, those lanes are
     // first extracted into a narrower vector, which is then reduced.
4166 auto *SrcVT = cast<FixedVectorType>(Cut->Src->getType());
4167 bool IsPartialReduction = !Cut->Elts.isAllOnes();
4168 FixedVectorType *ReduceVecTy =
4169 IsPartialReduction
4170 ? FixedVectorType::get(FVT->getElementType(), Cut->Elts.popcount())
4171 : SrcVT;
4172
4173 SmallVector<int> ExtractMask;
4174 InstructionCost NewCost = 0;
4175 if (IsPartialReduction) {
4176 for (unsigned I = 0, E = Cut->Elts.getBitWidth(); I != E; ++I)
4177 if (Cut->Elts[I])
4178 ExtractMask.push_back(I);
4179 unsigned SubIdx = 0, SubLen;
4180 auto SK = Cut->Elts.isShiftedMask(SubIdx, SubLen)
4183 NewCost += TTI.getShuffleCost(SK, ReduceVecTy, SrcVT, ExtractMask, CostKind,
4184 SubIdx, ReduceVecTy);
4185 }
4186
     // Float reductions are costed with two argument types (scalar element
     // type and vector) plus the common FMF; other reductions with the vector
     // type only.
4187 IntrinsicCostAttributes ICA(
4188 ReducedOp, ReduceVecTy->getElementType(),
4189 IsFloatReduction
4190 ? SmallVector<Type *, 2>{ReduceVecTy->getElementType(), ReduceVecTy}
4191 : SmallVector<Type *, 2>{ReduceVecTy},
4192 IsFloatReduction ? CommonFMF : FastMathFlags());
4193 NewCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
4194
4195 LLVM_DEBUG(dbgs() << "Found reduction shuffle chain: " << I << "\n OldCost : "
4196 << OrigCost << " vs NewCost: " << NewCost << "\n");
4197
4198 if (!OrigCost.isValid() || !NewCost.isValid())
4199 return false;
4200
     // If the chain root has a single use, a cost tie is accepted; otherwise
     // the new sequence must be strictly cheaper.
4201 if (VecOpEE->hasOneUse() ? (NewCost > OrigCost) : (NewCost >= OrigCost))
4202 return false;
4203
4204 Value *ReduceInput = Cut->Src;
4205 if (IsPartialReduction)
4206 ReduceInput = Builder.CreateShuffleVector(Cut->Src, ExtractMask);
4207
4208 Value *ReducedResult;
4209 if (IsFloatReduction) {
4211 *CommonBinOp, ReduceVecTy->getElementType(), /*AllowRHSConstant=*/false,
4212 CommonFMF.noSignedZeros());
4213 ReducedResult = Builder.CreateIntrinsic(ReducedOp, {ReduceVecTy},
4214 {Identity, ReduceInput}, CommonFMF);
4215 } else {
4216 ReducedResult =
4217 Builder.CreateIntrinsic(ReducedOp, {ReduceVecTy}, {ReduceInput});
4218 }
4219 replaceValue(I, *ReducedResult);
4220
4221 return true;
4222}
4223
4224/// Determine if it's more efficient to fold:
4225/// reduce(trunc(x)) -> trunc(reduce(x)).
4226/// reduce(sext(x)) -> sext(reduce(x)).
4227/// reduce(zext(x)) -> zext(reduce(x)).
/// Returns true if `I` was replaced by a cast of a reduction over the
/// un-cast source vector.
/// NOTE(review): listing lines 4263 (presumably the start of the declaration
/// of `OldCost`), 4266 and 4272 (continuation lines of the two
/// `getCastInstrCost` calls) are absent from this listing.
4228bool VectorCombine::foldCastFromReductions(Instruction &I) {
4229 auto *II = dyn_cast<IntrinsicInst>(&I);
4230 if (!II)
4231 return false;
4232
     // For add/mul reductions only a trunc source is accepted; and/or/xor
     // reductions additionally accept zext/sext (see the match below).
4233 bool TruncOnly = false;
4234 Intrinsic::ID IID = II->getIntrinsicID();
4235 switch (IID) {
4236 case Intrinsic::vector_reduce_add:
4237 case Intrinsic::vector_reduce_mul:
4238 TruncOnly = true;
4239 break;
4240 case Intrinsic::vector_reduce_and:
4241 case Intrinsic::vector_reduce_or:
4242 case Intrinsic::vector_reduce_xor:
4243 break;
4244 default:
4245 return false;
4246 }
4247
4248 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4249 Value *ReductionSrc = I.getOperand(0);
4250
     // The reduction operand must be a cast with a single use.
4251 Value *Src;
4252 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
4253 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
4254 return false;
4255
4256 auto CastOpc =
4257 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
4258
4259 auto *SrcTy = cast<VectorType>(Src->getType());
4260 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
4261 Type *ResultTy = I.getType();
4262
4264 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
4265 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
4267 cast<CastInst>(ReductionSrc));
4268 InstructionCost NewCost =
4269 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
4270 CostKind) +
4271 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
4273
     // Transform only when the new sequence is strictly cheaper and its cost
     // is valid.
4274 if (OldCost <= NewCost || !NewCost.isValid())
4275 return false;
4276
     // Reduce the un-cast source first, then apply the cast to the scalar
     // result.
4277 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
4278 II->getIntrinsicID(), {Src});
4279 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
4280 replaceValue(I, *NewCast);
4281 return true;
4282}
4283
4284/// Fold:
4285/// icmp pred (reduce.{add,or,and,umax,umin}(signbit_extract(x))), C
4286/// into:
4287/// icmp sgt/slt (reduce.{or,umax,and,umin}(x)), -1/0
4288///
4289/// Sign-bit reductions produce values with known semantics:
4290/// - reduce.{or,umax}: 0 if no element is negative, 1 if any is
4291/// - reduce.{and,umin}: 1 if all elements are negative, 0 if any isn't
4292/// - reduce.add: count of negative elements (0 to NumElts)
4293///
4294/// Both lshr and ashr are supported:
4295/// - lshr produces 0 or 1, so reduce.add range is [0, N]
4296/// - ashr produces 0 or -1, so reduce.add range is [-N, 0]
4297///
4298/// The fold generalizes to multiple source vectors combined with the same
4299/// operation as the reduction. For example:
4300/// reduce.or(or(shr A, shr B)) conceptually extends the vector
4301/// For reduce.add, this changes the count to M*N where M is the number of
4302/// source vectors.
4303///
4304/// We transform to a direct sign check on the original vector using
4305/// reduce.{or,umax} or reduce.{and,umin}.
4306///
4307/// In spirit, it's similar to foldSignBitCheck in InstCombine.
4308bool VectorCombine::foldSignBitReductionCmp(Instruction &I) {
4309 CmpPredicate Pred;
4310 IntrinsicInst *ReduceOp;
4311 const APInt *CmpVal;
4312 if (!match(&I,
4313 m_ICmp(Pred, m_OneUse(m_AnyIntrinsic(ReduceOp)), m_APInt(CmpVal))))
4314 return false;
4315
4316 Intrinsic::ID OrigIID = ReduceOp->getIntrinsicID();
4317 switch (OrigIID) {
4318 case Intrinsic::vector_reduce_or:
4319 case Intrinsic::vector_reduce_umax:
4320 case Intrinsic::vector_reduce_and:
4321 case Intrinsic::vector_reduce_umin:
4322 case Intrinsic::vector_reduce_add:
4323 break;
4324 default:
4325 return false;
4326 }
4327
4328 Value *ReductionSrc = ReduceOp->getArgOperand(0);
4329 auto *VecTy = dyn_cast<FixedVectorType>(ReductionSrc->getType());
4330 if (!VecTy)
4331 return false;
4332
4333 unsigned BitWidth = VecTy->getScalarSizeInBits();
4334 if (BitWidth == 1)
4335 return false;
4336
4337 unsigned NumElts = VecTy->getNumElements();
4338
4339 // Determine the expected tree opcode for multi-vector patterns.
4340 // The tree opcode must match the reduction's underlying operation.
4341 //
4342 // TODO: for pairs of equivalent operators, we should match both,
4343 // not only the most common.
4344 Instruction::BinaryOps TreeOpcode;
4345 switch (OrigIID) {
4346 case Intrinsic::vector_reduce_or:
4347 case Intrinsic::vector_reduce_umax:
4348 TreeOpcode = Instruction::Or;
4349 break;
4350 case Intrinsic::vector_reduce_and:
4351 case Intrinsic::vector_reduce_umin:
4352 TreeOpcode = Instruction::And;
4353 break;
4354 case Intrinsic::vector_reduce_add:
4355 TreeOpcode = Instruction::Add;
4356 break;
4357 default:
4358 llvm_unreachable("Unexpected intrinsic");
4359 }
4360
4361 // Collect sign-bit extraction leaves from an associative tree of TreeOpcode.
4362 // The tree conceptually extends the vector being reduced.
4363 SmallVector<Value *, 8> Worklist;
4364 SmallVector<Value *, 8> Sources; // Original vectors (X in shr X, BW-1)
4365 Worklist.push_back(ReductionSrc);
4366 std::optional<bool> IsAShr;
4367 constexpr unsigned MaxSources = 8;
4368
4369 // Calculate old cost: all shifts + tree ops + reduction
4370 InstructionCost OldCost = TTI.getInstructionCost(ReduceOp, CostKind);
4371
4372 while (!Worklist.empty() && Worklist.size() <= MaxSources &&
4373 Sources.size() <= MaxSources) {
4374 Value *V = Worklist.pop_back_val();
4375
4376 // Try to match sign-bit extraction: shr X, (bitwidth-1)
4377 Value *X;
4378 if (match(V, m_OneUse(m_Shr(m_Value(X), m_SpecificInt(BitWidth - 1))))) {
4379 auto *Shr = cast<Instruction>(V);
4380
4381 // All shifts must be the same type (all lshr or all ashr)
4382 bool ThisIsAShr = Shr->getOpcode() == Instruction::AShr;
4383 if (!IsAShr)
4384 IsAShr = ThisIsAShr;
4385 else if (*IsAShr != ThisIsAShr)
4386 return false;
4387
4388 Sources.push_back(X);
4389
4390 // As part of the fold, we remove all of the shifts, so we need to keep
4391 // track of their costs.
4392 OldCost += TTI.getInstructionCost(Shr, CostKind);
4393
4394 continue;
4395 }
4396
4397 // Try to extend through a tree node of the expected opcode
4398 Value *A, *B;
4399 if (!match(V, m_OneUse(m_BinOp(TreeOpcode, m_Value(A), m_Value(B)))))
4400 return false;
4401
4402 // We are potentially replacing these operations as well, so we add them
4403 // to the costs.
4405
4406 Worklist.push_back(A);
4407 Worklist.push_back(B);
4408 }
4409
4410 // Must have at least one source and not exceed limit
4411 if (Sources.empty() || Sources.size() > MaxSources ||
4412 Worklist.size() > MaxSources || !IsAShr)
4413 return false;
4414
4415 unsigned NumSources = Sources.size();
4416
4417 // For reduce.add, the total count must fit as a signed integer.
4418 // Range is [0, M*N] for lshr or [-M*N, 0] for ashr.
4419 if (OrigIID == Intrinsic::vector_reduce_add &&
4420 !isIntN(BitWidth, NumSources * NumElts))
4421 return false;
4422
4423 // Compute the boundary value when all elements are negative:
4424 // - Per-element contribution: 1 for lshr, -1 for ashr
4425 // - For add: M*N (total elements across all sources); for others: just 1
4426 unsigned Count =
4427 (OrigIID == Intrinsic::vector_reduce_add) ? NumSources * NumElts : 1;
4428 APInt NegativeVal(CmpVal->getBitWidth(), Count);
4429 if (*IsAShr)
4430 NegativeVal.negate();
4431
4432 // Range is [min(0, AllNegVal), max(0, AllNegVal)]
4433 APInt Zero = APInt::getZero(CmpVal->getBitWidth());
4434 APInt RangeLow = APIntOps::smin(Zero, NegativeVal);
4435 APInt RangeHigh = APIntOps::smax(Zero, NegativeVal);
4436
4437 // Determine comparison semantics:
4438 // - IsEq: true for equality test, false for inequality
4439 // - TestsNegative: true if testing against AllNegVal, false for zero
4440 //
4441 // In addition to EQ/NE against 0 or AllNegVal, we support inequalities
4442 // that fold to boundary tests given the narrow value range:
4443 // < RangeHigh -> != RangeHigh
4444 // > RangeHigh-1 -> == RangeHigh
4445 // > RangeLow -> != RangeLow
4446 // < RangeLow+1 -> == RangeLow
4447 //
4448 // For inequalities, we work with signed predicates only. Unsigned predicates
4449 // are canonicalized to signed when the range is non-negative (where they are
4450 // equivalent). When the range includes negative values, unsigned predicates
4451 // would have different semantics due to wrap-around, so we reject them.
4452 if (!ICmpInst::isEquality(Pred) && !ICmpInst::isSigned(Pred)) {
4453 if (RangeLow.isNegative())
4454 return false;
4455 Pred = ICmpInst::getSignedPredicate(Pred);
4456 }
4457
4458 bool IsEq;
4459 bool TestsNegative;
4460 if (ICmpInst::isEquality(Pred)) {
4461 if (CmpVal->isZero()) {
4462 TestsNegative = false;
4463 } else if (*CmpVal == NegativeVal) {
4464 TestsNegative = true;
4465 } else {
4466 return false;
4467 }
4468 IsEq = Pred == ICmpInst::ICMP_EQ;
4469 } else if (Pred == ICmpInst::ICMP_SLT && *CmpVal == RangeHigh) {
4470 IsEq = false;
4471 TestsNegative = (RangeHigh == NegativeVal);
4472 } else if (Pred == ICmpInst::ICMP_SGT && *CmpVal == RangeHigh - 1) {
4473 IsEq = true;
4474 TestsNegative = (RangeHigh == NegativeVal);
4475 } else if (Pred == ICmpInst::ICMP_SGT && *CmpVal == RangeLow) {
4476 IsEq = false;
4477 TestsNegative = (RangeLow == NegativeVal);
4478 } else if (Pred == ICmpInst::ICMP_SLT && *CmpVal == RangeLow + 1) {
4479 IsEq = true;
4480 TestsNegative = (RangeLow == NegativeVal);
4481 } else {
4482 return false;
4483 }
4484
4485 // For this fold we support four types of checks:
4486 //
4487 // 1. All lanes are negative - AllNeg
4488 // 2. All lanes are non-negative - AllNonNeg
4489 // 3. At least one negative lane - AnyNeg
4490 // 4. At least one non-negative lane - AnyNonNeg
4491 //
4492 // For each case, we can generate the following code:
4493 //
4494 // 1. AllNeg - reduce.and/umin(X) < 0
4495 // 2. AllNonNeg - reduce.or/umax(X) > -1
4496 // 3. AnyNeg - reduce.or/umax(X) < 0
4497 // 4. AnyNonNeg - reduce.and/umin(X) > -1
4498 //
4499 // The table below shows the aggregation of all supported cases
4500 // using these four cases.
4501 //
4502 // Reduction | == 0 | != 0 | == MAX | != MAX
4503 // ------------+-----------+-----------+-----------+-----------
4504 // or/umax | AllNonNeg | AnyNeg | AnyNeg | AllNonNeg
4505 // and/umin | AnyNonNeg | AllNeg | AllNeg | AnyNonNeg
4506 // add | AllNonNeg | AnyNeg | AllNeg | AnyNonNeg
4507 //
4508 // NOTE: MAX = 1 for or/and/umax/umin, and the vector size N for add
4509 //
4510 // For easier codegen and check inversion, we use the following encoding:
4511 //
4512 // 1. Bit-3 === requires or/umax (1) or and/umin (0) check
4513 // 2. Bit-2 === checks < 0 (1) or > -1 (0)
4514 // 3. Bit-1 === universal (1) or existential (0) check
4515 //
4516 // AnyNeg = 0b110: uses or/umax, checks negative, any-check
4517 // AllNonNeg = 0b101: uses or/umax, checks non-neg, all-check
4518 // AnyNonNeg = 0b000: uses and/umin, checks non-neg, any-check
4519 // AllNeg = 0b011: uses and/umin, checks negative, all-check
4520 //
4521 // XOR with 0b011 inverts the check (swaps all/any and neg/non-neg).
4522 //
4523 enum CheckKind : unsigned {
4524 AnyNonNeg = 0b000,
4525 AllNeg = 0b011,
4526 AllNonNeg = 0b101,
4527 AnyNeg = 0b110,
4528 };
4529 // Return true if we fold this check into or/umax and false for and/umin
4530 auto RequiresOr = [](CheckKind C) -> bool { return C & 0b100; };
4531 // Return true if we should check if result is negative and false otherwise
4532 auto IsNegativeCheck = [](CheckKind C) -> bool { return C & 0b010; };
4533 // Logically invert the check
4534 auto Invert = [](CheckKind C) { return CheckKind(C ^ 0b011); };
4535
4536 CheckKind Base;
4537 switch (OrigIID) {
4538 case Intrinsic::vector_reduce_or:
4539 case Intrinsic::vector_reduce_umax:
4540 Base = TestsNegative ? AnyNeg : AllNonNeg;
4541 break;
4542 case Intrinsic::vector_reduce_and:
4543 case Intrinsic::vector_reduce_umin:
4544 Base = TestsNegative ? AllNeg : AnyNonNeg;
4545 break;
4546 case Intrinsic::vector_reduce_add:
4547 Base = TestsNegative ? AllNeg : AllNonNeg;
4548 break;
4549 default:
4550 llvm_unreachable("Unexpected intrinsic");
4551 }
4552
4553 CheckKind Check = IsEq ? Base : Invert(Base);
4554
4555 auto PickCheaper = [&](Intrinsic::ID Arith, Intrinsic::ID MinMax) {
4556 InstructionCost ArithCost =
4558 VecTy, std::nullopt, CostKind);
4559 InstructionCost MinMaxCost =
4561 FastMathFlags(), CostKind);
4562 return ArithCost <= MinMaxCost ? std::make_pair(Arith, ArithCost)
4563 : std::make_pair(MinMax, MinMaxCost);
4564 };
4565
4566 // Choose output reduction based on encoding's MSB
4567 auto [NewIID, NewCost] = RequiresOr(Check)
4568 ? PickCheaper(Intrinsic::vector_reduce_or,
4569 Intrinsic::vector_reduce_umax)
4570 : PickCheaper(Intrinsic::vector_reduce_and,
4571 Intrinsic::vector_reduce_umin);
4572
4573 // Add cost of combining multiple sources with or/and
4574 if (NumSources > 1) {
4575 unsigned CombineOpc =
4576 RequiresOr(Check) ? Instruction::Or : Instruction::And;
4577 NewCost += TTI.getArithmeticInstrCost(CombineOpc, VecTy, CostKind) *
4578 (NumSources - 1);
4579 }
4580
4581 LLVM_DEBUG(dbgs() << "Found sign-bit reduction cmp: " << I << "\n OldCost: "
4582 << OldCost << " vs NewCost: " << NewCost << "\n");
4583
4584 if (NewCost > OldCost)
4585 return false;
4586
4587 // Generate the combined input and reduction
4588 Builder.SetInsertPoint(&I);
4589 Type *ScalarTy = VecTy->getScalarType();
4590
4591 Value *Input;
4592 if (NumSources == 1) {
4593 Input = Sources[0];
4594 } else {
4595 // Combine sources with or/and based on check type
4596 Input = RequiresOr(Check) ? Builder.CreateOr(Sources)
4597 : Builder.CreateAnd(Sources);
4598 }
4599
4600 Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, NewIID, {Input});
4601 Value *NewCmp = IsNegativeCheck(Check) ? Builder.CreateIsNeg(NewReduce)
4602 : Builder.CreateIsNotNeg(NewReduce);
4603 replaceValue(I, *NewCmp);
4604 return true;
4605}
4606
4607/// Fold a zero test of reduce.or or reduce.umax into a boolean reduction.
4608///
4609/// Vectorization may produce IR that compares the result of a scalar reduction
4610/// with zero. Depending on the target, lowering a reduction and a scalar
4611/// comparison separately can cost more than reducing lane-wise comparison
4612/// results. This fold creates the latter form only when it is not costlier.
4613///
4614/// Before:
4615/// %r = call iT @llvm.vector.reduce.or.vNiT(<N x iT> %x)
4616/// %cmp = icmp ne iT %r, 0
4617///
4618/// After:
4619/// %lane.cmp = icmp ne <N x iT> %x, zeroinitializer
4620/// %cmp = call i1 @llvm.vector.reduce.or.vNi1(<N x i1> %lane.cmp)
4621///
4622/// `reduce.or` and `reduce.umax` are non-zero when at least one lane is
4623/// non-zero. Therefore, `icmp ne` uses the existential `reduce.or` test.
4624/// Conversely, `icmp eq` must check that every lane is zero, so it uses the
4625/// universal `reduce.and` test.
4626///
4627/// Before:
4628/// %r = call iT @llvm.vector.reduce.umax.vNiT(<N x iT> %x)
4629/// %cmp = icmp eq iT %r, 0
4630///
4631/// After:
4632/// %lane.cmp = icmp eq <N x iT> %x, zeroinitializer
4633/// %cmp = call i1 @llvm.vector.reduce.and.vNi1(<N x i1> %lane.cmp)
4634bool VectorCombine::foldReductionZeroTest(Instruction &I) {
4635 CmpPredicate Pred;
4636 Value *Op;
4637
4638 if (!match(&I, m_c_ICmp(Pred, m_Value(Op), m_Zero())) ||
4639 !ICmpInst::isEquality(Pred))
4640 return false;
4641
4642 auto *II = dyn_cast<IntrinsicInst>(Op);
4643 if (!II || !II->hasOneUse())
4644 return false;
4645
4646 auto ReduceID = II->getIntrinsicID();
4647 if (ReduceID != Intrinsic::vector_reduce_or &&
4648 ReduceID != Intrinsic::vector_reduce_umax)
4649 return false;
4650
4651 Value *Vec = II->getArgOperand(0);
4652 auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
4653 if (!VecTy || !VecTy->getElementType()->isIntegerTy())
4654 return false;
4655
4656 // Map the scalar zero test to an any-lane or all-lane boolean reduction.
4657 Intrinsic::ID NewIID = (Pred == ICmpInst::ICMP_NE)
4658 ? Intrinsic::vector_reduce_or
4659 : Intrinsic::vector_reduce_and;
4660
4661 // This is not an unconditional canonicalization: compare the cost of the
4662 // original scalar reduction and compare with the vector compare and i1
4663 // reduction replacement for both reduce.or and reduce.umax.
4666
4667 auto *CmpTy = cast<VectorType>(CmpInst::makeCmpResultType(VecTy));
4668 InstructionCost NewCost =
4669 TTI.getCmpSelInstrCost(Instruction::ICmp, VecTy, CmpTy, Pred, CostKind);
4671 getArithmeticReductionInstruction(NewIID), CmpTy, std::nullopt, CostKind);
4672
4673 LLVM_DEBUG(dbgs() << "Found a reduction zero test: " << I << "\n OldCost: "
4674 << OldCost << " vs NewCost: " << NewCost << "\n");
4675
4676 if (!OldCost.isValid() || !NewCost.isValid() || NewCost > OldCost)
4677 return false;
4678
4679 Builder.SetInsertPoint(&I);
4680 Value *NewCmp = Builder.CreateICmp(Pred, Vec, Constant::getNullValue(VecTy));
4681 Value *NewReduce = Builder.CreateIntrinsic(NewIID, {CmpTy}, {NewCmp});
4682 replaceValue(I, *NewReduce);
4683 return true;
4684}
4685
4686/// vector.reduce.OP f(X_i) == 0 -> vector.reduce.OP X_i == 0
4687///
4688/// We can prove it for cases when:
4689///
4690/// 1. OP X_i == 0 <=> \forall i \in [1, N] X_i == 0
4691/// 1'. OP X_i == 0 <=> \exists j \in [1, N] X_j == 0
4692/// 2. f(x) == 0 <=> x == 0
4693///
4694/// From 1 and 2 (or 1' and 2), we can infer that
4695///
4696/// OP f(X_i) == 0 <=> OP X_i == 0.
4697///
4698/// (1)
4699/// OP f(X_i) == 0 <=> \forall i \in [1, N] f(X_i) == 0
4700/// (2)
4701/// <=> \forall i \in [1, N] X_i == 0
4702/// (1)
4703/// <=> OP(X_i) == 0
4704///
4705/// For some of the OP's and f's, we need to have domain constraints on X
4706/// to ensure properties 1 (or 1') and 2.
4707bool VectorCombine::foldICmpEqZeroVectorReduce(Instruction &I) {
4708 CmpPredicate Pred;
4709 Value *Op;
4710 if (!match(&I, m_ICmp(Pred, m_Value(Op), m_Zero())) ||
4711 !ICmpInst::isEquality(Pred))
4712 return false;
4713
4714 auto *II = dyn_cast<IntrinsicInst>(Op);
4715 if (!II)
4716 return false;
4717
4718 switch (II->getIntrinsicID()) {
4719 case Intrinsic::vector_reduce_add:
4720 case Intrinsic::vector_reduce_or:
4721 case Intrinsic::vector_reduce_umin:
4722 case Intrinsic::vector_reduce_umax:
4723 case Intrinsic::vector_reduce_smin:
4724 case Intrinsic::vector_reduce_smax:
4725 break;
4726 default:
4727 return false;
4728 }
4729
4730 Value *InnerOp = II->getArgOperand(0);
4731
4732 // TODO: fixed vector type might be too restrictive
4733 if (!II->hasOneUse() || !isa<FixedVectorType>(InnerOp->getType()))
4734 return false;
4735
4736 Value *X = nullptr;
4737
4738 // Check for zero-preserving operations where f(x) = 0 <=> x = 0
4739 //
4740 // 1. f(x) = shl nuw x, y for arbitrary y
4741 // 2. f(x) = mul nuw x, c for defined c != 0
4742 // 3. f(x) = zext x
4743 // 4. f(x) = sext x
4744 // 5. f(x) = neg x
4745 //
4746 if (!(match(InnerOp, m_NUWShl(m_Value(X), m_Value())) || // Case 1
4747 match(InnerOp, m_NUWMul(m_Value(X), m_NonZeroInt())) || // Case 2
4748 match(InnerOp, m_ZExt(m_Value(X))) || // Case 3
4749 match(InnerOp, m_SExt(m_Value(X))) || // Case 4
4750 match(InnerOp, m_Neg(m_Value(X))) // Case 5
4751 ))
4752 return false;
4753
4754 SimplifyQuery S = SQ.getWithInstruction(&I);
4755 auto *XTy = cast<FixedVectorType>(X->getType());
4756
4757 // Check for domain constraints for all supported reductions.
4758 //
4759 // a. OR X_i - has property 1 for every X
4760 // b. UMAX X_i - has property 1 for every X
4761 // c. UMIN X_i - has property 1' for every X
4762 // d. SMAX X_i - has property 1 for X >= 0
4763 // e. SMIN X_i - has property 1' for X >= 0
4764 // f. ADD X_i - has property 1 for X >= 0 && ADD X_i doesn't sign wrap
4765 //
4766 // In order for the proof to work, we need 1 (or 1') to be true for both
4767 // OP f(X_i) and OP X_i and that's why below we check constraints twice.
4768 //
4769 // NOTE: ADD X_i holds property 1 for a mirror case as well, i.e. when
4770 // X <= 0 && ADD X_i doesn't sign wrap. However, due to the nature
4771 // of known bits, we can't reasonably hold knowledge of "either 0
4772 // or negative".
4773 switch (II->getIntrinsicID()) {
4774 case Intrinsic::vector_reduce_add: {
4775 // We need to check that both X_i and f(X_i) have enough leading
4776 // zeros to not overflow.
4777 KnownBits KnownX = computeKnownBits(X, S);
4778 KnownBits KnownFX = computeKnownBits(InnerOp, S);
4779 unsigned NumElems = XTy->getNumElements();
4780 // Adding N elements loses at most ceil(log2(N)) leading bits.
4781 unsigned LostBits = Log2_32_Ceil(NumElems);
4782 unsigned LeadingZerosX = KnownX.countMinLeadingZeros();
4783 unsigned LeadingZerosFX = KnownFX.countMinLeadingZeros();
4784 // Need at least one leading zero left after summation to ensure no overflow
4785 if (LeadingZerosX <= LostBits || LeadingZerosFX <= LostBits)
4786 return false;
4787
4788 // We are not checking whether X or f(X) are positive explicitly because
4789 // we implicitly checked for it when we checked if both cases have enough
4790 // leading zeros to not wrap addition.
4791 break;
4792 }
4793 case Intrinsic::vector_reduce_smin:
4794 case Intrinsic::vector_reduce_smax:
4795 // Check whether X >= 0 and f(X) >= 0
4796 if (!isKnownNonNegative(InnerOp, S) || !isKnownNonNegative(X, S))
4797 return false;
4798
4799 break;
4800 default:
4801 break;
4802 };
4803
4804 LLVM_DEBUG(dbgs() << "Found a reduction to 0 comparison with removable op: "
4805 << *II << "\n");
4806
4807 // For zext/sext, check if the transform is profitable using cost model.
4808 // For other operations (shl, mul, neg), we're removing an instruction
4809 // while keeping the same reduction type, so it's always profitable.
4810 if (isa<ZExtInst>(InnerOp) || isa<SExtInst>(InnerOp)) {
4811 auto *FXTy = cast<FixedVectorType>(InnerOp->getType());
4812 Intrinsic::ID IID = II->getIntrinsicID();
4813
4815 cast<CastInst>(InnerOp)->getOpcode(), FXTy, XTy,
4817
4818 InstructionCost OldReduceCost, NewReduceCost;
4819 switch (IID) {
4820 case Intrinsic::vector_reduce_add:
4821 case Intrinsic::vector_reduce_or:
4822 OldReduceCost = TTI.getArithmeticReductionCost(
4823 getArithmeticReductionInstruction(IID), FXTy, std::nullopt, CostKind);
4824 NewReduceCost = TTI.getArithmeticReductionCost(
4825 getArithmeticReductionInstruction(IID), XTy, std::nullopt, CostKind);
4826 break;
4827 case Intrinsic::vector_reduce_umin:
4828 case Intrinsic::vector_reduce_umax:
4829 case Intrinsic::vector_reduce_smin:
4830 case Intrinsic::vector_reduce_smax:
4831 OldReduceCost = TTI.getMinMaxReductionCost(
4832 getMinMaxReductionIntrinsicOp(IID), FXTy, FastMathFlags(), CostKind);
4833 NewReduceCost = TTI.getMinMaxReductionCost(
4834 getMinMaxReductionIntrinsicOp(IID), XTy, FastMathFlags(), CostKind);
4835 break;
4836 default:
4837 llvm_unreachable("Unexpected reduction");
4838 }
4839
4840 InstructionCost OldCost = OldReduceCost + ExtCost;
4841 InstructionCost NewCost =
4842 NewReduceCost + (InnerOp->hasOneUse() ? 0 : ExtCost);
4843
4844 LLVM_DEBUG(dbgs() << "Found a removable extension before reduction: "
4845 << *InnerOp << "\n OldCost: " << OldCost
4846 << " vs NewCost: " << NewCost << "\n");
4847
4848 // We consider transformation to still be potentially beneficial even
4849 // when the costs are the same because we might remove a use from f(X)
4850 // and unlock other optimizations. Equal costs would just mean that we
4851 // didn't make it worse in the worst case.
4852 if (NewCost > OldCost)
4853 return false;
4854 }
4855
4856 // Since we support zext and sext as f, we might change the scalar type
4857 // of the intrinsic.
4858 Type *Ty = XTy->getScalarType();
4859 Value *NewReduce = Builder.CreateIntrinsic(Ty, II->getIntrinsicID(), {X});
4860 Value *NewCmp =
4861 Builder.CreateICmp(Pred, NewReduce, ConstantInt::getNullValue(Ty));
4862 replaceValue(I, *NewCmp);
4863 return true;
4864}
4865
4866/// Fold comparisons of reduce.or/reduce.and with reduce.umax/reduce.umin
4867/// based on cost, preserving the comparison semantics.
4868///
4869/// We use two fundamental properties for each pair:
4870///
4871/// 1. or(X) == 0 <=> umax(X) == 0
4872/// 2. or(X) == 1 <=> umax(X) == 1
4873/// 3. sign(or(X)) == sign(umax(X))
4874///
4875/// 1. and(X) == -1 <=> umin(X) == -1
4876/// 2. and(X) == -2 <=> umin(X) == -2
4877/// 3. sign(and(X)) == sign(umin(X))
4878///
4879/// From these we can infer the following transformations:
4880/// a. or(X) ==/!= 0 <-> umax(X) ==/!= 0
4881/// b. or(X) s< 0 <-> umax(X) s< 0
4882/// c. or(X) s> -1 <-> umax(X) s> -1
4883/// d. or(X) s< 1 <-> umax(X) s< 1
4884/// e. or(X) ==/!= 1 <-> umax(X) ==/!= 1
4885/// f. or(X) s< 2 <-> umax(X) s< 2
4886/// g. and(X) ==/!= -1 <-> umin(X) ==/!= -1
4887/// h. and(X) s< 0 <-> umin(X) s< 0
4888/// i. and(X) s> -1 <-> umin(X) s> -1
4889/// j. and(X) s> -2 <-> umin(X) s> -2
4890/// k. and(X) ==/!= -2 <-> umin(X) ==/!= -2
4891/// l. and(X) s> -3 <-> umin(X) s> -3
4892///
4893bool VectorCombine::foldEquivalentReductionCmp(Instruction &I) {
4894 CmpPredicate Pred;
4895 Value *ReduceOp;
4896 const APInt *CmpVal;
4897 if (!match(&I, m_ICmp(Pred, m_Value(ReduceOp), m_APInt(CmpVal))))
4898 return false;
4899
4900 auto *II = dyn_cast<IntrinsicInst>(ReduceOp);
4901 if (!II || !II->hasOneUse())
4902 return false;
4903
4904 const auto IsValidOrUmaxCmp = [&]() {
4905 // or === umax for i1
4906 if (CmpVal->getBitWidth() == 1)
4907 return true;
4908
4909 // Cases a and e
4910 bool IsEquality =
4911 (CmpVal->isZero() || CmpVal->isOne()) && ICmpInst::isEquality(Pred);
4912 // Case c
4913 bool IsPositive = CmpVal->isAllOnes() && Pred == ICmpInst::ICMP_SGT;
4914 // Cases b, d, and f
4915 bool IsNegative = (CmpVal->isZero() || CmpVal->isOne() || *CmpVal == 2) &&
4916 Pred == ICmpInst::ICMP_SLT;
4917 return IsEquality || IsPositive || IsNegative;
4918 };
4919
4920 const auto IsValidAndUminCmp = [&]() {
4921 // and === umin for i1
4922 if (CmpVal->getBitWidth() == 1)
4923 return true;
4924
4925 const auto LeadingOnes = CmpVal->countl_one();
4926
4927 // Cases g and k
4928 bool IsEquality =
4929 (CmpVal->isAllOnes() || LeadingOnes + 1 == CmpVal->getBitWidth()) &&
4931 // Case h
4932 bool IsNegative = CmpVal->isZero() && Pred == ICmpInst::ICMP_SLT;
4933 // Cases i, j, and l
4934 bool IsPositive =
4935 // if the number has at least N - 2 leading ones
4936 // and the two LSBs are:
4937 // - 1 x 1 -> -1
4938 // - 1 x 0 -> -2
4939 // - 0 x 1 -> -3
4940 LeadingOnes + 2 >= CmpVal->getBitWidth() &&
4941 ((*CmpVal)[0] || (*CmpVal)[1]) && Pred == ICmpInst::ICMP_SGT;
4942 return IsEquality || IsNegative || IsPositive;
4943 };
4944
4945 Intrinsic::ID OriginalIID = II->getIntrinsicID();
4946 Intrinsic::ID AlternativeIID;
4947
4948 // Check if this is a valid comparison pattern and determine the alternate
4949 // reduction intrinsic.
4950 switch (OriginalIID) {
4951 case Intrinsic::vector_reduce_or:
4952 if (!IsValidOrUmaxCmp())
4953 return false;
4954 AlternativeIID = Intrinsic::vector_reduce_umax;
4955 break;
4956 case Intrinsic::vector_reduce_umax:
4957 if (!IsValidOrUmaxCmp())
4958 return false;
4959 AlternativeIID = Intrinsic::vector_reduce_or;
4960 break;
4961 case Intrinsic::vector_reduce_and:
4962 if (!IsValidAndUminCmp())
4963 return false;
4964 AlternativeIID = Intrinsic::vector_reduce_umin;
4965 break;
4966 case Intrinsic::vector_reduce_umin:
4967 if (!IsValidAndUminCmp())
4968 return false;
4969 AlternativeIID = Intrinsic::vector_reduce_and;
4970 break;
4971 default:
4972 return false;
4973 }
4974
4975 Value *X = II->getArgOperand(0);
4976 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
4977 if (!VecTy)
4978 return false;
4979
4980 const auto GetReductionCost = [&](Intrinsic::ID IID) -> InstructionCost {
4981 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4982 if (ReductionOpc != Instruction::ICmp)
4983 return TTI.getArithmeticReductionCost(ReductionOpc, VecTy, std::nullopt,
4984 CostKind);
4986 FastMathFlags(), CostKind);
4987 };
4988
4989 InstructionCost OrigCost = GetReductionCost(OriginalIID);
4990 InstructionCost AltCost = GetReductionCost(AlternativeIID);
4991
4992 LLVM_DEBUG(dbgs() << "Found equivalent reduction cmp: " << I
4993 << "\n OrigCost: " << OrigCost
4994 << " vs AltCost: " << AltCost << "\n");
4995
4996 if (AltCost >= OrigCost)
4997 return false;
4998
4999 Builder.SetInsertPoint(&I);
5000 Type *ScalarTy = VecTy->getScalarType();
5001 Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, AlternativeIID, {X});
5002 Value *NewCmp =
5003 Builder.CreateICmp(Pred, NewReduce, ConstantInt::get(ScalarTy, *CmpVal));
5004
5005 replaceValue(I, *NewCmp);
5006 return true;
5007}
5008
5009/// Used by foldReduceAddCmpZero to check if we can prove that a value is
5010/// non-positive.
5011/// KnownBits cannot see sext <? x i1> as non-positive: each top bit equals a
5012/// single unknown input bit, which a per-bit lattice cannot track. The fold's
5013/// target shape is popcount-style sums of <N x i1> valid/invalid masks (e.g.
5014/// ray-intersection hits) tested for any-hit.
5015/// Previous attempts to approximate the known bits of such expressions were
5016/// using a fully recursive value tracking approach to infer a constant range
5017/// but ultimately turned to be too expensive in compile time.
5018static bool isKnownNonPositive(const Value *V, const SimplifyQuery &SQ,
5019 unsigned Depth = 0) {
5020 constexpr unsigned MaxLocalDepth = 2;
5021 if (Depth > MaxLocalDepth)
5022 return false;
5023
5024 auto NumSignBits = [&](const Value *X) {
5025 return ComputeNumSignBits(X, SQ.DL, SQ.AC, SQ.CxtI, SQ.DT);
5026 };
5027 if (NumSignBits(V) == V->getType()->getScalarSizeInBits())
5028 return true;
5029
5030 Value *A, *B;
5031 if (match(V, m_Add(m_Value(A), m_Value(B))))
5032 return NumSignBits(A) >= 2 && NumSignBits(B) >= 2 &&
5033 isKnownNonPositive(A, SQ, Depth + 1) &&
5034 isKnownNonPositive(B, SQ, Depth + 1);
5035
5036 return computeKnownBits(V, SQ).isNonPositive();
5037}
5038
5039/// Fold (icmp pred (reduce.add X), 0) to (icmp pred' (reduce.or X), 0) when X
5040/// has lanes known to all be non-negative or all non-positive, so that
5041/// sum == 0 iff every lane is 0. Falls back to reduce.umax if reduce.or is
5042/// more expensive on the target.
5043bool VectorCombine::foldReduceAddCmpZero(Instruction &I) {
5044 CmpPredicate Pred;
5045 Value *Vec;
5046 if (!match(&I, m_ICmp(Pred,
5048 m_Value(Vec))),
5049 m_Zero())))
5050 return false;
5051
5052 auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
5053 if (!VecTy || VecTy->getNumElements() < 2)
5054 return false;
5055
5056 SimplifyQuery Q = SQ.getWithInstruction(&I);
5057 bool IsNonNegative = isKnownNonNegative(Vec, Q);
5058 bool IsNonPositive = !IsNonNegative && isKnownNonPositive(Vec, Q);
5059 if (!IsNonNegative && !IsNonPositive)
5060 return false;
5061
5062 // Summing NumElts lanes can consume up to log2(NumElts) sign bits. Require
5063 // strictly more headroom than that so the sum cannot wrap to zero.
5064 unsigned NumElts = VecTy->getNumElements();
5065 unsigned NumSignBits = ComputeNumSignBits(Vec, *DL, SQ.AC, &I, &DT);
5066 if (Log2_32(NumElts) >= NumSignBits)
5067 return false;
5068
5069 ICmpInst::Predicate NewPred;
5070 switch (Pred) {
5071 case ICmpInst::ICMP_EQ:
5072 case ICmpInst::ICMP_ULE:
5073 case ICmpInst::ICMP_SLE:
5074 case ICmpInst::ICMP_SGE:
5075 NewPred = ICmpInst::ICMP_EQ;
5076 break;
5077 case ICmpInst::ICMP_NE:
5078 case ICmpInst::ICMP_UGT:
5079 case ICmpInst::ICMP_SGT:
5080 case ICmpInst::ICMP_SLT:
5081 NewPred = ICmpInst::ICMP_NE;
5082 break;
5083 default:
5084 return false;
5085 }
5086
5087 // SGT and SLE on a non-positive tree, and SLT and SGE on a non-negative
5088 // tree, are tautologies (always true or always false). Leave those to
5089 // InstCombine rather than mapping them here. Remaining signed inequalities
5090 // also need one extra sign bit so the sum cannot flip sign.
5091 if (!IsNonNegative &&
5092 (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE))
5093 return false;
5094 if (!IsNonPositive &&
5095 (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE))
5096 return false;
5097 if ((Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE ||
5098 Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE) &&
5099 Log2_32(NumElts) >= NumSignBits - 1)
5100 return false;
5101
5103 Instruction::Add, VecTy, std::nullopt, CostKind);
5105 Instruction::Or, VecTy, std::nullopt, CostKind);
5107 Intrinsic::umax, VecTy, FastMathFlags(), CostKind);
5108 if (!OrCost.isValid() && !UmaxCost.isValid())
5109 return false;
5110 bool UseOr = OrCost.isValid() && (!UmaxCost.isValid() || OrCost <= UmaxCost);
5111 InstructionCost AltCost = UseOr ? OrCost : UmaxCost;
5112 if (AltCost > OrigCost)
5113 return false;
5114
5115 Builder.SetInsertPoint(&I);
5116 Value *NewReduce = UseOr ? Builder.CreateOrReduce(Vec)
5117 : Builder.CreateIntrinsic(
5118 Intrinsic::vector_reduce_umax, {VecTy}, {Vec});
5119 Worklist.pushValue(NewReduce);
5120 Value *NewCmp = Builder.CreateICmp(
5121 NewPred, NewReduce, ConstantInt::getNullValue(VecTy->getScalarType()));
5122 replaceValue(I, *NewCmp);
5123 return true;
5124}
5125
5126/// Returns true if this ShuffleVectorInst eventually feeds into a
5127/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
5128/// chains of shuffles and binary operators (in any combination/order).
5129/// The search does not go deeper than the given Depth.
5131 constexpr unsigned MaxVisited = 32;
5134 bool FoundReduction = false;
5135
5136 WorkList.push_back(SVI);
5137 while (!WorkList.empty()) {
5138 Instruction *I = WorkList.pop_back_val();
5139 for (User *U : I->users()) {
5140 auto *UI = cast<Instruction>(U);
5141 if (!UI || !Visited.insert(UI).second)
5142 continue;
5143 if (Visited.size() > MaxVisited)
5144 return false;
5145 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
5146 // More than one reduction reached
5147 if (FoundReduction)
5148 return false;
5149 switch (II->getIntrinsicID()) {
5150 case Intrinsic::vector_reduce_add:
5151 case Intrinsic::vector_reduce_mul:
5152 case Intrinsic::vector_reduce_and:
5153 case Intrinsic::vector_reduce_or:
5154 case Intrinsic::vector_reduce_xor:
5155 case Intrinsic::vector_reduce_smin:
5156 case Intrinsic::vector_reduce_smax:
5157 case Intrinsic::vector_reduce_umin:
5158 case Intrinsic::vector_reduce_umax:
5159 FoundReduction = true;
5160 continue;
5161 default:
5162 return false;
5163 }
5164 }
5165
5167 return false;
5168
5169 WorkList.emplace_back(UI);
5170 }
5171 }
5172 return FoundReduction;
5173}
5174
5175/// This method looks for groups of shuffles acting on binops, of the form:
5176/// %x = shuffle ...
5177/// %y = shuffle ...
5178/// %a = binop %x, %y
5179/// %b = binop %x, %y
5180/// shuffle %a, %b, selectmask
5181/// We may, especially if the shuffle is wider than legal, be able to convert
5182/// the shuffle to a form where only parts of a and b need to be computed. On
5183/// architectures with no obvious "select" shuffle, this can reduce the total
5184/// number of operations if the target reports them as cheaper.
5185bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
5186 auto *SVI = cast<ShuffleVectorInst>(&I);
5187 auto *VT = cast<FixedVectorType>(I.getType());
5188 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
5189 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
5190 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
5191 VT != Op0->getType())
5192 return false;
5193
5194 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
5195 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
5196 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
5197 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
5198 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
5199 auto checkSVNonOpUses = [&](Instruction *I) {
5200 if (!I || I->getOperand(0)->getType() != VT)
5201 return true;
5202 return any_of(I->users(), [&](User *U) {
5203 return U != Op0 && U != Op1 &&
5204 !(isa<ShuffleVectorInst>(U) &&
5205 (InputShuffles.contains(cast<Instruction>(U)) ||
5206 isInstructionTriviallyDead(cast<Instruction>(U))));
5207 });
5208 };
5209 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
5210 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
5211 return false;
5212
5213 // Collect all the uses that are shuffles that we can transform together. We
5214 // may not have a single shuffle, but a group that can all be transformed
5215 // together profitably.
5217 auto collectShuffles = [&](Instruction *I) {
5218 for (auto *U : I->users()) {
5219 auto *SV = dyn_cast<ShuffleVectorInst>(U);
5220 if (!SV || SV->getType() != VT)
5221 return false;
5222 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
5223 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
5224 return false;
5225 if (!llvm::is_contained(Shuffles, SV))
5226 Shuffles.push_back(SV);
5227 }
5228 return true;
5229 };
5230 if (!collectShuffles(Op0) || !collectShuffles(Op1))
5231 return false;
5232 // From a reduction, we need to be processing a single shuffle, otherwise the
5233 // other uses will not be lane-invariant.
5234 if (FromReduction && Shuffles.size() > 1)
5235 return false;
5236
5237 // Add any shuffle uses for the shuffles we have found, to include them in our
5238 // cost calculations.
5239 if (!FromReduction) {
5240 for (size_t Idx = 0, E = Shuffles.size(); Idx != E; ++Idx) {
5241 for (auto *U : Shuffles[Idx]->users()) {
5242 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
5243 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
5244 Shuffles.push_back(SSV);
5245 }
5246 }
5247 }
5248
5249 // For each of the output shuffles, we try to sort all the first vector
5250 // elements to the beginning, followed by the second array elements at the
5251 // end. If the binops are legalized to smaller vectors, this may reduce total
5252 // number of binops. We compute the ReconstructMask mask needed to convert
5253 // back to the original lane order.
5255 SmallVector<SmallVector<int>> OrigReconstructMasks;
5256 int MaxV1Elt = 0, MaxV2Elt = 0;
5257 unsigned NumElts = VT->getNumElements();
5258 for (ShuffleVectorInst *SVN : Shuffles) {
5259 SmallVector<int> Mask;
5260 SVN->getShuffleMask(Mask);
5261
5262 // Check the operands are the same as the original, or reversed (in which
5263 // case we need to commute the mask).
5264 Value *SVOp0 = SVN->getOperand(0);
5265 Value *SVOp1 = SVN->getOperand(1);
5266 if (isa<UndefValue>(SVOp1)) {
5267 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
5268 SVOp0 = SSV->getOperand(0);
5269 SVOp1 = SSV->getOperand(1);
5270 for (int &Elem : Mask) {
5271 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
5272 return false;
5273 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
5274 }
5275 }
5276 if (SVOp0 == Op1 && SVOp1 == Op0) {
5277 std::swap(SVOp0, SVOp1);
5279 }
5280 if (SVOp0 != Op0 || SVOp1 != Op1)
5281 return false;
5282
5283 // Calculate the reconstruction mask for this shuffle, as the mask needed to
5284 // take the packed values from Op0/Op1 and reconstructing to the original
5285 // order.
5286 SmallVector<int> ReconstructMask;
5287 for (unsigned I = 0; I < Mask.size(); I++) {
5288 if (Mask[I] < 0) {
5289 ReconstructMask.push_back(-1);
5290 } else if (Mask[I] < static_cast<int>(NumElts)) {
5291 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
5292 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
5293 return Mask[I] == A.first;
5294 });
5295 if (It != V1.end())
5296 ReconstructMask.push_back(It - V1.begin());
5297 else {
5298 ReconstructMask.push_back(V1.size());
5299 V1.emplace_back(Mask[I], V1.size());
5300 }
5301 } else {
5302 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
5303 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
5304 return Mask[I] - static_cast<int>(NumElts) == A.first;
5305 });
5306 if (It != V2.end())
5307 ReconstructMask.push_back(NumElts + It - V2.begin());
5308 else {
5309 ReconstructMask.push_back(NumElts + V2.size());
5310 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
5311 }
5312 }
5313 }
5314
5315 // For reductions, we know that the lane ordering out doesn't alter the
5316 // result. In-order can help simplify the shuffle away.
5317 if (FromReduction)
5318 sort(ReconstructMask);
5319 OrigReconstructMasks.push_back(std::move(ReconstructMask));
5320 }
5321
5322 // If the Maximum element used from V1 and V2 are not larger than the new
5323 // vectors, the vectors are already packed and performing the optimization
5324 // again will likely not help any further. This also prevents us from getting
5325 // stuck in a cycle in case the costs do not also rule it out.
5326 if (V1.empty() || V2.empty() ||
5327 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
5328 MaxV2Elt == static_cast<int>(V2.size()) - 1))
5329 return false;
5330
5331 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
5332 // shuffle of another shuffle, or not a shuffle (that is treated like an
5333 // identity shuffle).
5334 auto GetBaseMaskValue = [&](Instruction *I, int M) {
5335 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5336 if (!SV)
5337 return M;
5338 if (isa<UndefValue>(SV->getOperand(1)))
5339 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
5340 if (InputShuffles.contains(SSV))
5341 return SSV->getMaskValue(SV->getMaskValue(M));
5342 return SV->getMaskValue(M);
5343 };
5344
5345 // Attempt to sort the inputs by ascending mask values to make simpler input
5346 // shuffles and push complex shuffles down to the uses. We sort on the first
5347 // of the two input shuffle orders, to try and get at least one input into a
5348 // nice order.
5349 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
5350 std::pair<int, int> Y) {
5351 int MXA = GetBaseMaskValue(A, X.first);
5352 int MYA = GetBaseMaskValue(A, Y.first);
5353 return MXA < MYA;
5354 };
5355 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
5356 return SortBase(SVI0A, A, B);
5357 });
5358 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
5359 return SortBase(SVI1A, A, B);
5360 });
5361 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
5362 // modified order of the input shuffles.
5363 SmallVector<SmallVector<int>> ReconstructMasks;
5364 for (const auto &Mask : OrigReconstructMasks) {
5365 SmallVector<int> ReconstructMask;
5366 for (int M : Mask) {
5367 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
5368 auto It = find_if(V, [M](auto A) { return A.second == M; });
5369 assert(It != V.end() && "Expected all entries in Mask");
5370 return std::distance(V.begin(), It);
5371 };
5372 if (M < 0)
5373 ReconstructMask.push_back(-1);
5374 else if (M < static_cast<int>(NumElts)) {
5375 ReconstructMask.push_back(FindIndex(V1, M));
5376 } else {
5377 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
5378 }
5379 }
5380 ReconstructMasks.push_back(std::move(ReconstructMask));
5381 }
5382
5383 // Calculate the masks needed for the new input shuffles, which get padded
5384 // with undef
5385 SmallVector<int> V1A, V1B, V2A, V2B;
5386 for (unsigned I = 0; I < V1.size(); I++) {
5387 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
5388 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
5389 }
5390 for (unsigned I = 0; I < V2.size(); I++) {
5391 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
5392 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
5393 }
5394 while (V1A.size() < NumElts) {
5397 }
5398 while (V2A.size() < NumElts) {
5401 }
5402
5403 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
5404 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5405 if (!SV)
5406 return C;
5407 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
5410 VT, VT, SV->getShuffleMask(), CostKind);
5411 };
5412 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
5413 return C +
5415 };
5416
5417 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
5418 unsigned MaxVectorSize =
5420 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
5421 if (MaxElementsInVector == 0)
5422 return false;
5423 // When there are multiple shufflevector operations on the same input,
5424 // especially when the vector length is larger than the register size,
5425 // identical shuffle patterns may occur across different groups of elements.
5426 // To avoid overestimating the cost by counting these repeated shuffles more
5427 // than once, we only account for unique shuffle patterns. This adjustment
5428 // prevents inflated costs in the cost model for wide vectors split into
5429 // several register-sized groups.
5430 std::set<SmallVector<int, 4>> UniqueShuffles;
5431 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
5432 // Compute the cost for performing the shuffle over the full vector.
5433 auto ShuffleCost =
5435 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
5436 if (NumFullVectors < 2)
5437 return C + ShuffleCost;
5438 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
5439 unsigned NumUniqueGroups = 0;
5440 unsigned NumGroups = Mask.size() / MaxElementsInVector;
5441 // For each group of MaxElementsInVector contiguous elements,
5442 // collect their shuffle pattern and insert into the set of unique patterns.
5443 for (unsigned I = 0; I < NumFullVectors; ++I) {
5444 for (unsigned J = 0; J < MaxElementsInVector; ++J)
5445 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
5446 if (UniqueShuffles.insert(SubShuffle).second)
5447 NumUniqueGroups += 1;
5448 }
5449 return C + ShuffleCost * NumUniqueGroups / NumGroups;
5450 };
5451 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
5452 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5453 if (!SV)
5454 return C;
5455 SmallVector<int, 16> Mask;
5456 SV->getShuffleMask(Mask);
5457 return AddShuffleMaskAdjustedCost(C, Mask);
5458 };
5459 // Check that input consists of ShuffleVectors applied to the same input
5460 auto AllShufflesHaveSameOperands =
5461 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
5462 if (InputShuffles.size() < 2)
5463 return false;
5464 ShuffleVectorInst *FirstSV =
5465 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
5466 if (!FirstSV)
5467 return false;
5468
5469 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
5470 return std::all_of(
5471 std::next(InputShuffles.begin()), InputShuffles.end(),
5472 [&](Instruction *I) {
5473 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
5474 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
5475 });
5476 };
5477
5478 // Get the costs of the shuffles + binops before and after with the new
5479 // shuffle masks.
5480 InstructionCost CostBefore =
5481 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
5482 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
5483 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
5484 InstructionCost(0), AddShuffleCost);
5485 if (AllShufflesHaveSameOperands(InputShuffles)) {
5486 UniqueShuffles.clear();
5487 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
5488 InstructionCost(0), AddShuffleAdjustedCost);
5489 } else {
5490 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
5491 InstructionCost(0), AddShuffleCost);
5492 }
5493
5494 // The new binops will be unused for lanes past the used shuffle lengths.
5495 // These types attempt to get the correct cost for that from the target.
5496 FixedVectorType *Op0SmallVT =
5497 FixedVectorType::get(VT->getScalarType(), V1.size());
5498 FixedVectorType *Op1SmallVT =
5499 FixedVectorType::get(VT->getScalarType(), V2.size());
5500 InstructionCost CostAfter =
5501 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
5502 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
5503 UniqueShuffles.clear();
5504 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
5505 InstructionCost(0), AddShuffleMaskAdjustedCost);
5506 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
5507 CostAfter +=
5508 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
5509 InstructionCost(0), AddShuffleMaskCost);
5510
5511 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
5512 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
5513 << " vs CostAfter: " << CostAfter << "\n");
5514 if (CostBefore < CostAfter ||
5515 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
5516 return false;
5517
5518 // The cost model has passed, create the new instructions.
5519 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
5520 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5521 if (!SV)
5522 return I;
5523 if (isa<UndefValue>(SV->getOperand(1)))
5524 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
5525 if (InputShuffles.contains(SSV))
5526 return SSV->getOperand(Op);
5527 return SV->getOperand(Op);
5528 };
5529 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
5530 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
5531 GetShuffleOperand(SVI0A, 1), V1A);
5532 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
5533 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
5534 GetShuffleOperand(SVI0B, 1), V1B);
5535 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
5536 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
5537 GetShuffleOperand(SVI1A, 1), V2A);
5538 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
5539 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
5540 GetShuffleOperand(SVI1B, 1), V2B);
5541 Builder.SetInsertPoint(Op0);
5542 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
5543 NSV0A, NSV0B);
5544 if (auto *I = dyn_cast<Instruction>(NOp0))
5545 I->copyIRFlags(Op0, true);
5546 Builder.SetInsertPoint(Op1);
5547 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
5548 NSV1A, NSV1B);
5549 if (auto *I = dyn_cast<Instruction>(NOp1))
5550 I->copyIRFlags(Op1, true);
5551
5552 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
5553 Builder.SetInsertPoint(Shuffles[S]);
5554 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
5555 replaceValue(*Shuffles[S], *NSV, false);
5556 }
5557
5558 Worklist.pushValue(NSV0A);
5559 Worklist.pushValue(NSV0B);
5560 Worklist.pushValue(NSV1A);
5561 Worklist.pushValue(NSV1B);
5562 return true;
5563}
5564
5565/// Check if instruction depends on ZExt and this ZExt can be moved after the
5566/// instruction. Move ZExt if it is profitable. For example:
5567/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
5568/// lshr(zext(x),y) -> zext(lshr(x,trunc(y)))
5569/// The cost model takes into account whether zext(x) has other users and
5570/// whether it can be propagated through them too.
///
/// Only \p I itself is rewritten here; the other users of the zext are merely
/// checked (and costed) to confirm the narrowing could be applied to them too.
///
/// \param I the bitwise-logic or lshr instruction to narrow.
/// \returns true if \p I was replaced by a narrow operation followed by a
/// zext, false if the pattern did not match or the transform was rejected.
5571bool VectorCombine::shrinkType(Instruction &I) {
5572 Value *ZExted, *OtherOperand;
  // Accept a bitwise logic op with a zext on either side, or an lshr whose
  // shifted value (operand 0) is a zext.
5573 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
5574 m_Value(OtherOperand))) &&
5575 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
5576 return false;
5577
  // The zext instruction itself: whichever operand of I is not OtherOperand.
5578 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
5579
  // cast<> (not dyn_cast<>) is used, so both the wide result type and the
  // narrow zext source type are expected to be fixed-width vectors here.
5580 auto *BigTy = cast<FixedVectorType>(I.getType());
5581 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
  // Element bit width of the narrow type.
5582 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
5583
5584 if (I.getOpcode() == Instruction::LShr) {
5585 // Check that the shift amount is less than the number of bits in the
5586 // smaller type. Otherwise, the smaller lshr will return a poison value.
5587 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
5588 if (ShAmtKB.getMaxValue().uge(BW))
5589 return false;
5590 } else {
5591 // Check that the expression overall uses at most the same number of bits as
5592 // ZExted
5593 KnownBits KB = computeKnownBits(&I, *DL);
5594 if (KB.countMaxActiveBits() > BW)
5595 return false;
5596 }
5597
5598 // Calculate costs of leaving current IR as it is and moving ZExt operation
5599 // later, along with adding truncates if needed
5601 Instruction::ZExt, BigTy, SmallTy,
5602 TargetTransformInfo::CastContextHint::None, CostKind);
  // The current IR pays for one zext up front; the shrunk form starts at zero
  // and is charged a zext per narrowed user in the loop below.
5603 InstructionCost CurrentCost = ZExtCost;
5604 InstructionCost ShrinkCost = 0;
5605
5606 // Calculate total cost and check that we can propagate through all ZExt users
5607 for (User *U : ZExtOperand->users()) {
5608 auto *UI = cast<Instruction>(U);
5609 if (UI == &I) {
  // I was already validated above: wide op today vs. narrow op + zext.
5610 CurrentCost +=
5611 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
5612 ShrinkCost +=
5613 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
5614 ShrinkCost += ZExtCost;
5615 continue;
5616 }
5617
  // Any other user must be a binary operator, otherwise give up entirely.
5618 if (!Instruction::isBinaryOp(UI->getOpcode()))
5619 return false;
5620
5621 // Check if we can propagate ZExt through its other users
5622 KnownBits KB = computeKnownBits(UI, *DL);
5623 if (KB.countMaxActiveBits() > BW)
5624 return false;
5625
5626 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
5627 ShrinkCost +=
5628 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
5629 ShrinkCost += ZExtCost;
5630 }
5631
5632 // If the other instruction operand is not a constant, we'll need to
5633 // generate a truncate instruction. So we have to adjust cost
5634 if (!isa<Constant>(OtherOperand))
5635 ShrinkCost += TTI.getCastInstrCost(
5636 Instruction::Trunc, SmallTy, BigTy,
5637 TargetTransformInfo::CastContextHint::None, CostKind);
5638
5639 // If the cost of shrinking types and leaving the IR is the same, we'll lean
5640 // towards modifying the IR because shrinking opens opportunities for other
5641 // shrinking optimisations.
5642 if (ShrinkCost > CurrentCost)
5643 return false;
5644
  // Emit: narrow-op(ZExted, trunc(OtherOperand)) followed by a zext to BigTy.
5645 Builder.SetInsertPoint(&I);
5646 Value *Op0 = ZExted;
5647 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
5648 // Keep the order of operands the same
5649 if (I.getOperand(0) == OtherOperand)
5650 std::swap(Op0, Op1);
5651 Value *NewBinOp =
5652 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
  // Carry the IR flags and metadata of the wide instruction over to the
  // narrow replacement.
5653 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
5654 cast<Instruction>(NewBinOp)->copyMetadata(I);
5655 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
5656 replaceValue(I, *NewZExtr);
5657 return true;
5658}
5659
5660/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
5661/// shuffle (DstVec, SrcVec, Mask)
///
/// Both indices must be constants. The source and destination vectors may
/// have different element counts (an extra length-changing shuffle is then
/// emitted), but they must share the same element type.
///
/// \param I the insertelement instruction to try to fold.
/// \returns true if \p I was replaced by a shuffle, false otherwise.
5662bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
5663 Value *DstVec, *SrcVec;
5664 uint64_t ExtIdx, InsIdx;
5665 if (!match(&I,
5666 m_InsertElt(m_Value(DstVec),
5667 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
5668 m_ConstantInt(InsIdx))))
5669 return false;
5670
5671 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
5672 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
5673 // The vectors may differ in element count, but their element types must match.
5674 if (!DstVecTy || !SrcVecTy ||
5675 SrcVecTy->getElementType() != DstVecTy->getElementType())
5676 return false;
5677
5678 unsigned NumDstElts = DstVecTy->getNumElements();
5679 unsigned NumSrcElts = SrcVecTy->getNumElements();
  // Reject out-of-range indices and single-element destinations.
5680 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
5681 return false;
5682
5683 // Insertion into poison is a cheaper single operand shuffle.
5685 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
5686
  // NeedExpOrNarrow: SrcVec must first be resized to NumDstElts lanes.
  // NeedDstSrcSwap: DstVec is poison, so SrcVec can become the first (and only
  // meaningful) shuffle operand.
5687 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
5688 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
5689 if (NeedDstSrcSwap) {
  // Every lane stays poison except InsIdx, which reads the extracted element
  // from what is now the first operand.
5691 Mask[InsIdx] = ExtIdx % NumDstElts;
5692 std::swap(DstVec, SrcVec);
5693 } else {
  // Identity over DstVec, with lane InsIdx redirected into the second operand
  // (indices >= NumDstElts select from SrcVec).
5695 std::iota(Mask.begin(), Mask.end(), 0);
5696 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
5697 }
5698
5699 // Cost
5700 auto *Ins = cast<InsertElementInst>(&I);
5701 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
5702 InstructionCost InsCost =
5703 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
  // NOTE(review): the extract reads from the source vector, yet DstVecTy is
  // passed as its vector type -- confirm this is intended when the element
  // counts differ.
5704 InstructionCost ExtCost =
5705 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
5706 InstructionCost OldCost = ExtCost + InsCost;
5707
5708 InstructionCost NewCost = 0;
5709 SmallVector<int> ExtToVecMask;
5710 if (!NeedExpOrNarrow) {
5711 // Ignore 'free' identity insertion shuffle.
5712 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
5713 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
5714 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
5715 nullptr, {DstVec, SrcVec});
5716 } else {
5717 // When creating a length-changing-vector, always try to keep the relevant
5718 // element in an equivalent position, so that bulk shuffles are more likely
5719 // to be useful.
5720 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
5721 ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
5722 // Add cost for expanding or narrowing
5724 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
5725 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
5726 }
5727
  // If the extract has other users it stays in the IR, so its cost is still
  // paid after the transform.
5728 if (!Ext->hasOneUse())
5729 NewCost += ExtCost;
5730
5731 LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair: " << I
5732 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
5733 << "\n");
5734
  // Ties go to the shuffle form.
5735 if (OldCost < NewCost)
5736 return false;
5737
5738 if (NeedExpOrNarrow) {
  // Resize the original source vector to NumDstElts lanes. After the swap
  // above it is held in DstVec rather than SrcVec.
5739 if (!NeedDstSrcSwap)
5740 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
5741 else
5742 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
5743 }
5744
5745 // Canonicalize undef param to RHS to help further folds.
5746 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
5747 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
5748 std::swap(DstVec, SrcVec);
5749 }
5750
5751 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
5752 replaceValue(I, *Shuf);
5753
5754 return true;
5755}
5756
5757/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
5758/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
5759/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
5760/// before casting it back into `<vscale x 16 x i32>`.
///
/// \param I the candidate instruction.
/// \returns true if \p I was replaced by a bitcast of the wider constant
/// splat, false otherwise.
5761bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
5762 const APInt *SplatVal0, *SplatVal1;
5764 m_APInt(SplatVal0), m_APInt(SplatVal1))))
5765 return false;
5766
5767 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
5768 << "\n");
5769
  // VTy is the type of one interleave input; ExtVTy has the same element
  // count with elements twice as wide; Width is the original element width.
5770 auto *VTy =
5771 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
5772 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
5773 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
5774
5775 // Just in case the cost of interleave2 intrinsic and bitcast are both
5776 // invalid, in which case we want to bail out, we use <= rather
5777 // than < here. Even they both have valid and equal costs, it's probably
5778 // not a good idea to emit a high-cost constant splat.
5780 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
5782 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
5783 << *I.getType() << " is too high.\n");
5784 return false;
5785 }
5786
  // Pack both splat values into one element of twice the width:
  // (SplatVal1 << Width) | SplatVal0, each zero-extended first.
5787 APInt NewSplatVal = SplatVal1->zext(Width * 2);
5788 NewSplatVal <<= Width;
5789 NewSplatVal |= SplatVal0->zext(Width * 2);
5790 auto *NewSplat = ConstantVector::getSplat(
5791 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
5792
  // A local builder positioned at I emits the bitcast back to I's type.
5793 IRBuilder<> Builder(&I);
5794 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
5795 return true;
5796}
5797
5798/// Given this sequence:
5799/// ```
5800/// %d = llvm.vector.deinterleave2 <vscale x 16 x i32> %v
5801/// %f0 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %d, 0
5802/// %f1 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %d, 1
5803///
5804/// %low0 = and <vscale x 8 x i32> %f0, splat (i32 65535)
5805/// %low1 = shl <vscale x 8 x i32> %f1, splat (i32 16)
5806/// %merge0 = or disjoint <vscale x 8 x i32> %low0, %low1
5807///
5808/// %high0 = and <vscale x 8 x i32> %f1, splat (i32 -65536)
5809/// %high1 = lshr <vscale x 8 x i32> %f0, splat (i32 16)
5810/// %merge1 = or disjoint <vscale x 8 x i32> %high0, %high1
5811/// ```
5812/// It is actually just de-interleaving a 16-bit vector with double the
5813/// vector length. More generally speaking, it's de-interleaving on a vector
5814/// with half the element width as the original vector.
5815///
5816/// Therefore, we can turn it into:
5817/// ```
5818/// %narrow.v = bitcast <vscale x 16 x i32> %v to <vscale x 32 x i16>
5819/// %d = llvm.vector.deinterleave2 <vscale x 32 x i16> %narrow.v
5820/// %f0 = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %d, 0
5821/// %f1 = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %d, 1
5822///
5823/// %merge0 = bitcast <vscale x 16 x i16> %f0 to <vscale x 8 x i32>
5824/// %merge1 = bitcast <vscale x 16 x i16> %f1 to <vscale x 8 x i32>
5825/// ```
5826bool VectorCombine::foldDeinterleaveIntrinsics(Instruction &I) {
  // Recognize a deinterleave2 whose two fields are recombined half-element by
  // half-element with and/shl/lshr/or, and replace the recombination with a
  // deinterleave2 on a vector of half-width elements plus bitcasts.
  // Returns true if the merge instructions were replaced.
5827 // This pattern involves bitcast that is not compatible with big endian.
5828 if (DL->isBigEndian())
5829 return false;
5830
5831 using namespace PatternMatch;
5832 Value *DeinterleavedVal;
5833 if (!match(&I, m_Deinterleave2(m_Value(DeinterleavedVal))))
5834 return false;
5835
  // Only integer elements with a power-of-two width of at least 2 bits can be
  // split into two equal halves.
5836 VectorType *VecTy = cast<VectorType>(DeinterleavedVal->getType());
5837 IntegerType *ElementTy = dyn_cast<IntegerType>(VecTy->getElementType());
5838 if (!ElementTy)
5839 return false;
5840 unsigned ElementWidth = ElementTy->getBitWidth();
5841 if (ElementWidth < 2 || !isPowerOf2_32(ElementWidth))
5842 return false;
5843 unsigned HalfElementWidth = ElementWidth / 2;
5844
  // The intrinsic result must have exactly two uses: one extractvalue per
  // field.
5845 if (!I.hasNUses(2))
5846 return false;
5847 std::array<ExtractValueInst *, 2> OrigFields{};
5848 for (User *Usr : I.users()) {
5849 auto *E = dyn_cast<ExtractValueInst>(Usr);
5850 // The deinterleave result can only be used by extractions.
5851 if (!E || E->getNumIndices() != 1)
5852 return false;
5853 unsigned Idx = *E->idx_begin();
5854 // A single field cannot be extracted more than once.
  // Each field must also have exactly two uses of its own.
5855 if (Idx >= 2 || OrigFields[Idx] || !E->hasNUses(2))
5856 return false;
5857 OrigFields[Idx] = E;
5858 }
5859
5860 // Find the merge instruction (i.e. OR) first.
  // Each user of field 0 must itself have a single use, and that use must be
  // an instruction; those instructions are the merge candidates.
5861 SmallVector<Instruction *, 2> MergeInsts;
5862 for (auto *FieldUsr : OrigFields[0]->users()) {
5863 if (!FieldUsr->hasOneUse() || !isa<Instruction>(FieldUsr->user_back()))
5864 return false;
5865 MergeInsts.push_back(cast<Instruction>(FieldUsr->user_back()));
5866 }
5867 assert(MergeInsts.size() == 2);
5868
5869 // Pattern match bottom-up from the merge instructions.
  // MergeInsts[0] must be or(and(f0, LoMask), shl(f1, Half)) and
  // MergeInsts[1] must be or(and(f1, HiMask), lshr(f0, Half)).
5870 auto MatchMerge = [&](void) -> bool {
5871 APInt LoMask = APInt::getLowBitsSet(ElementWidth, HalfElementWidth);
5872 APInt HiMask = APInt::getHighBitsSet(ElementWidth, HalfElementWidth);
5873 return match(MergeInsts[0],
5874 m_c_Or(m_And(m_Specific(OrigFields[0]), m_SpecificInt(LoMask)),
5875 m_Shl(m_Specific(OrigFields[1]),
5876 m_SpecificInt(HalfElementWidth)))) &&
5877 match(MergeInsts[1],
5878 m_c_Or(m_And(m_Specific(OrigFields[1]), m_SpecificInt(HiMask)),
5879 m_LShr(m_Specific(OrigFields[0]),
5880 m_SpecificInt(HalfElementWidth))));
5881 };
  // The candidates were collected in use-list order, so try both
  // assignments; on success MergeInsts[0]/[1] are the low/high merges.
5882 if (!MatchMerge()) {
5883 std::swap(MergeInsts[0], MergeInsts[1]);
5884 if (!MatchMerge())
5885 return false;
5886 }
5887
5888 // Profitability check.
  // Old sequence: one merge (the or) plus its two operand instructions.
5889 InstructionCost OldCost =
5890 TTI.getInstructionCost(MergeInsts[0], CostKind) +
5891 TTI.getInstructionCost(cast<Instruction>(MergeInsts[0]->getOperand(0)),
5892 CostKind) +
5893 TTI.getInstructionCost(cast<Instruction>(MergeInsts[0]->getOperand(1)),
5894 CostKind);
5895 // There are two fields (assuming SHL has the same cost as LSHR).
5896 OldCost *= 2;
5897
  // NewFieldTy: a field type with half-width elements.
  // NewVecTy: NewFieldTy with twice the element count (the new intrinsic
  // input type).
5898 auto *NewFieldTy = VecTy->getWithNewBitWidth(HalfElementWidth);
5899 auto *NewVecTy =
5900 VectorType::getDoubleElementsVectorType(cast<VectorType>(NewFieldTy));
5901 InstructionCost NewCost =
5902 TTI.getCastInstrCost(Instruction::BitCast, VecTy, NewVecTy,
5904 TTI.getCastInstrCost(Instruction::BitCast, NewFieldTy,
5905 MergeInsts[0]->getType(), TTI::CastContextHint::None,
5906 CostKind) *
5907 2;
  // Require a strict improvement and a valid new cost.
5908 if (OldCost <= NewCost || !NewCost.isValid()) {
5909 LLVM_DEBUG(
5910 dbgs() << "VC: New deinterleave2 sequence cost (" << NewCost << ")"
5911 << " is higher than that of the old one (" << OldCost << ")\n");
5912 return false;
5913 }
5914
5915 // Do the replacement.
5916 IRBuilder<> Builder(&I);
5917 Value *NewVecCast = Builder.CreateBitCast(DeinterleavedVal, NewVecTy);
5918 Value *NewDeinterleave = Builder.CreateIntrinsic(
5919 Intrinsic::vector_deinterleave2, {NewVecTy}, {NewVecCast});
  // Field Idx of the narrow deinterleave, bitcast back to the merge's type,
  // replaces MergeInsts[Idx].
5920 for (auto [Idx, MergeInst] : enumerate(MergeInsts)) {
5921 Value *NewField = Builder.CreateExtractValue(NewDeinterleave, Idx);
5922 NewField = Builder.CreateBitCast(NewField, MergeInst->getType());
5923 replaceValue(*MergeInst, *NewField);
5924 }
5925
5926 return true;
5927}
5928
/// Fold a no-op vector cast of a vp.load with an all-ones mask into a vp.load
/// that loads the cast's destination type directly, scaling the explicit
/// vector length (EVL) by the ratio of the element counts.
///
/// \param I the candidate cast instruction.
/// \returns true if the cast was replaced by a new vp.load, false otherwise.
5929bool VectorCombine::foldBitcastOfVPLoad(Instruction &I) {
5930 const DataLayout &DL = I.getDataLayout();
5931 auto *Cast = dyn_cast<CastInst>(&I);
  // Only no-op casts that produce a vector are of interest.
5932 if (!Cast || !Cast->isNoopCast(DL) || !isa<VectorType>(Cast->getDestTy()))
5933 return false;
5934
5935 // Fold away bit casts of the loaded value by loading the desired type,
5936 // if the mask is all-ones.
5937 Value *EVL;
5938 auto *II = dyn_cast<VPIntrinsic>(I.getOperand(0));
5940 m_Value(), m_AllOnes(), m_Value(EVL)))))
5941 return false;
5942
5943 VectorType *OrigVecTy = cast<VectorType>(II->getType());
  // Alignment of the original load: the pointer's own alignment if it has
  // one, otherwise the ABI alignment of the loaded type.
5944 Align OrigAlign =
5945 DL.getValueOrABITypeAlignment(II->getPointerAlignment(), OrigVecTy);
5946 ElementCount OrigVecCnt = OrigVecTy->getElementCount();
5947 VectorType *NewVecTy = cast<VectorType>(Cast->getDestTy());
5948 ElementCount NewVecCnt = NewVecTy->getElementCount();
5949
5950 // Right now we only support cases where the NewVec is longer, because for
5951 // cases where it's shorter, we have to be sure that EVL can be exactly
5952 // divided, otherwise it might yield incorrect results or even page faults
5953 // (if we round-up during the division).
5954 if (!(OrigVecCnt.isScalable() == NewVecCnt.isScalable() &&
5955 NewVecCnt.hasKnownScalarFactor(OrigVecCnt)))
5956 return false;
5957
  // Old: vp.load of the original type plus the bitcast.
  // New: a single vp.load of the destination type, same pointer and
  // alignment.
5958 InstructionCost OldCost =
5959 TTI.getMemIntrinsicInstrCost({Intrinsic::vp_load, OrigVecTy,
5960 II->getMemoryPointerParam(), false,
5961 OrigAlign},
5962 CostKind) +
5963 TTI.getCastInstrCost(Instruction::BitCast, Cast->getType(), OrigVecTy,
5966 {Intrinsic::vp_load, NewVecTy, II->getMemoryPointerParam(), false,
5967 OrigAlign},
5968 CostKind);
5969 LLVM_DEBUG(dbgs() << "foldBitcastOfVPLoad: OldCost=" << OldCost
5970 << " NewCost=" << NewCost << "\n");
  // Ties go to the folded form; an invalid new cost always rejects.
5971 if (NewCost > OldCost || !NewCost.isValid())
5972 return false;
5973
  // The new vector has Factor times as many elements, so the EVL is scaled by
  // the same factor (the multiply is emitted with the nuw flag).
5974 unsigned Factor = NewVecCnt.getKnownScalarFactor(OrigVecCnt);
5975 Value *NewEVL = Builder.CreateNUWMul(EVL, Builder.getInt32(Factor));
  // Rebuild the all-true mask at the new element count.
5976 Value *NewMask = Builder.CreateVectorSplat(NewVecCnt, Builder.getTrue());
5977 CallInst *NewVP = Builder.CreateIntrinsicWithoutFolding(
5978 NewVecTy, Intrinsic::vp_load,
5979 {II->getMemoryPointerParam(), NewMask, NewEVL});
5980 // Preserve the original alignment.
5981 NewVP->addParamAttrs(
5982 0, AttrBuilder(II->getContext()).addAlignmentAttr(OrigAlign));
5983 replaceValue(*Cast, *NewVP);
5984 return true;
5985}
5986
5987/// Fold a bswap/bitreverse pair, in either nesting order, into a single
5988/// bitreverse performed on a vector of bytes:
5989/// bswap(bitreverse(x)) --> bitcast(bitreverse(bitcast(x)))
5990/// bitreverse(bswap(x)) --> bitcast(bitreverse(bitcast(x)))
///
/// \param I the outer intrinsic call.
/// \returns true if \p I was replaced, false if the pattern did not match or
/// the new sequence is not strictly cheaper.
5991bool VectorCombine::foldBitOrderReverseAndSwap(Instruction &I) {
5992 Value *X;
5993 if (!match(&I, m_BitReverse(m_BSwap(m_Value(X)))) &&
5995 return false;
5996
  // Build the i8 vector type with one element per byte of Ty's store size,
  // preserving scalability.
5997 Type *Ty = I.getType();
5998 Type *I8Ty = Builder.getInt8Ty();
5999 TypeSize ElementSize = DL->getTypeStoreSize(Ty);
6000 ElementCount NewVecCnt = ElementCount::get(ElementSize.getKnownMinValue(),
6001 ElementSize.isScalable());
6002 Type *NewVecTy = VectorType::get(I8Ty, NewVecCnt);
6003
  // II is the outer intrinsic call, InnerII the one it wraps.
6004 auto *II = cast<IntrinsicInst>(&I);
6005 auto *InnerII = cast<IntrinsicInst>(II->getArgOperand(0));
6006 // OldCost = cost of bitreverse/bswap + cost of bswap/bitreverse
6009
6010 // NewCost = cost of bitcast to byte vector +
6011 // cost of bitreverse/bswap on byte vector +
6012 // cost of bitcast back to original type
6013 InstructionCost CastToVecCost = TTI.getCastInstrCost(
6014 Instruction::BitCast, NewVecTy, Ty, TTI::CastContextHint::None, CostKind);
6015 InstructionCost CastToOrigCost = TTI.getCastInstrCost(
6016 Instruction::BitCast, Ty, NewVecTy, TTI::CastContextHint::None, CostKind);
6017
6018 IntrinsicCostAttributes ICANew(Intrinsic::bitreverse, NewVecTy, {NewVecTy});
6019 InstructionCost NewIntrinsicCost =
6021 InstructionCost NewCost = CastToVecCost + NewIntrinsicCost + CastToOrigCost;
6022
  // An inner call with other users stays in the IR, so it is still paid for.
6023 if (!InnerII->hasOneUse())
6024 NewCost += TTI.getInstructionCost(InnerII, CostKind);
6025
6026 LLVM_DEBUG(dbgs() << "Found bitorder reverse and swap: " << I
6027 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
6028 << "\n");
  // Require a strict improvement and a valid new cost.
6029 if (!NewCost.isValid() || NewCost >= OldCost)
6030 return false;
6031
6032 // Perform transform: bitcast(arg, <N x i8>), bitreverse, bitcast back
6033 Builder.SetInsertPoint(II);
6034 Value *CastToVec = Builder.CreateBitCast(X, NewVecTy);
6035 Value *NewCall =
6036 Builder.CreateUnaryIntrinsic(Intrinsic::bitreverse, CastToVec);
6037 Value *CastToOrig = Builder.CreateBitCast(NewCall, Ty);
6038 replaceValue(I, *CastToOrig);
6039 return true;
6040}
6041
6042// Attempt to shrink loads that are only used by shufflevector instructions.
6043bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
6044 auto *OldLoad = dyn_cast<LoadInst>(&I);
6045 if (!OldLoad || !OldLoad->isSimple())
6046 return false;
6047
6048 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
6049 if (!OldLoadTy)
6050 return false;
6051
6052 unsigned const OldNumElements = OldLoadTy->getNumElements();
6053
6054 // Search all uses of load. If all uses are shufflevector instructions, and
6055 // the second operands are all poison values, find the minimum and maximum
6056 // indices of the vector elements referenced by all shuffle masks.
6057 // Otherwise return `std::nullopt`.
6058 using IndexRange = std::pair<int, int>;
6059 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
6060 IndexRange OutputRange = IndexRange(OldNumElements, -1);
6061 for (llvm::Use &Use : I.uses()) {
6062 // Ensure all uses match the required pattern.
6063 User *Shuffle = Use.getUser();
6064 ArrayRef<int> Mask;
6065
6066 if (!match(Shuffle,
6067 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
6068 return std::nullopt;
6069
6070 // Ignore shufflevector instructions that have no uses.
6071 if (Shuffle->use_empty())
6072 continue;
6073
6074 // Find the min and max indices used by the shufflevector instruction.
6075 for (int Index : Mask) {
6076 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
6077 OutputRange.first = std::min(Index, OutputRange.first);
6078 OutputRange.second = std::max(Index, OutputRange.second);
6079 }
6080 }
6081 }
6082
6083 if (OutputRange.second < OutputRange.first)
6084 return std::nullopt;
6085
6086 return OutputRange;
6087 };
6088
6089 // Get the range of vector elements used by shufflevector instructions.
6090 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
6091 unsigned const NewNumElements = Indices->second + 1u;
6092
6093 // If the range of vector elements is smaller than the full load, attempt
6094 // to create a smaller load.
6095 if (NewNumElements < OldNumElements) {
6096 IRBuilder Builder(&I);
6097 Builder.SetCurrentDebugLocation(I.getDebugLoc());
6098
6099 // Calculate costs of old and new ops.
6100 Type *ElemTy = OldLoadTy->getElementType();
6101 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
6102 Value *PtrOp = OldLoad->getPointerOperand();
6103
6105 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
6106 OldLoad->getPointerAddressSpace(), CostKind);
6107 InstructionCost NewCost =
6108 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
6109 OldLoad->getPointerAddressSpace(), CostKind);
6110
6111 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
6113 unsigned const MaxIndex = NewNumElements * 2u;
6114
6115 for (llvm::Use &Use : I.uses()) {
6116 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
6117
6118 // Ignore shufflevector instructions that have no uses.
6119 if (Shuffle->use_empty())
6120 continue;
6121
6122 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
6123
6124 // Create entry for new use.
6125 NewUses.push_back({Shuffle, OldMask});
6126
6127 // Validate mask indices.
6128 for (int Index : OldMask) {
6129 if (Index >= static_cast<int>(MaxIndex))
6130 return false;
6131 }
6132
6133 // Update costs.
6134 OldCost +=
6136 OldLoadTy, OldMask, CostKind);
6137 NewCost +=
6139 NewLoadTy, OldMask, CostKind);
6140 }
6141
6142 LLVM_DEBUG(
6143 dbgs() << "Found a load used only by shufflevector instructions: "
6144 << I << "\n OldCost: " << OldCost
6145 << " vs NewCost: " << NewCost << "\n");
6146
6147 if (OldCost < NewCost || !NewCost.isValid())
6148 return false;
6149
6150 // Create new load of smaller vector.
6151 auto *NewLoad = cast<LoadInst>(
6152 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
6153 NewLoad->copyMetadata(I);
6154
6155 // Replace all uses.
6156 for (UseEntry &Use : NewUses) {
6157 ShuffleVectorInst *Shuffle = Use.first;
6158 std::vector<int> &NewMask = Use.second;
6159
6160 Builder.SetInsertPoint(Shuffle);
6161 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
6162 Value *NewShuffle = Builder.CreateShuffleVector(
6163 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
6164
6165 replaceValue(*Shuffle, *NewShuffle, false);
6166 }
6167
6168 return true;
6169 }
6170 }
6171 return false;
6172}
6173
6174// Attempt to narrow a phi of shufflevector instructions where the two incoming
6175// values have the same operands but different masks. If the two shuffle masks
6176// are offsets of one another we can use one branch to rotate the incoming
6177// vector and perform one larger shuffle after the phi.
6178bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
6179 auto *Phi = dyn_cast<PHINode>(&I);
6180 if (!Phi || Phi->getNumIncomingValues() != 2u)
6181 return false;
6182
6183 Value *Op = nullptr;
6184 ArrayRef<int> Mask0;
6185 ArrayRef<int> Mask1;
6186
6187 if (!match(Phi->getOperand(0u),
6188 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
6189 !match(Phi->getOperand(1u),
6190 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
6191 return false;
6192
6193 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
6194
6195 // Ensure result vectors are wider than the argument vector.
6196 auto *InputVT = cast<FixedVectorType>(Op->getType());
6197 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
6198 auto const InputNumElements = InputVT->getNumElements();
6199
6200 if (InputNumElements >= ResultVT->getNumElements())
6201 return false;
6202
6203 // Take the difference of the two shuffle masks at each index. Ignore poison
6204 // values at the same index in both masks.
6205 SmallVector<int, 16> NewMask;
6206 NewMask.reserve(Mask0.size());
6207
6208 for (auto [M0, M1] : zip(Mask0, Mask1)) {
6209 if (M0 >= 0 && M1 >= 0)
6210 NewMask.push_back(M0 - M1);
6211 else if (M0 == -1 && M1 == -1)
6212 continue;
6213 else
6214 return false;
6215 }
6216
6217 // Ensure all elements of the new mask are equal. If the difference between
6218 // the incoming mask elements is the same, the two must be constant offsets
6219 // of one another.
6220 if (NewMask.empty() || !all_equal(NewMask))
6221 return false;
6222
6223 // Create new mask using difference of the two incoming masks.
6224 int MaskOffset = NewMask[0u];
6225 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
6226 NewMask.clear();
6227
6228 for (unsigned I = 0u; I < InputNumElements; ++I) {
6229 NewMask.push_back(Index);
6230 Index = (Index + 1u) % InputNumElements;
6231 }
6232
6233 // Calculate costs for worst cases and compare.
6234 auto const Kind = TTI::SK_PermuteSingleSrc;
6235 auto OldCost =
6236 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
6237 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
6238 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
6239 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
6240
6241 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
6242 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
6243 << "\n");
6244
6245 if (NewCost > OldCost)
6246 return false;
6247
6248 // Create new shuffles and narrowed phi.
6249 auto Builder = IRBuilder(Shuf);
6250 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
6251 auto *PoisonVal = PoisonValue::get(InputVT);
6252 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
6253 Worklist.push(cast<Instruction>(NewShuf0));
6254
6255 Builder.SetInsertPoint(Phi);
6256 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
6257 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
6258 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
6259 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
6260
6261 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
6262 PoisonVal = PoisonValue::get(NewPhi->getType());
6263 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
6264
6265 replaceValue(*Phi, *NewShuf1);
6266 return true;
6267}
6268
6269/// This is the entry point for all transforms. Pass manager differences are
6270/// handled in the callers of this function.
6271bool VectorCombine::run() {
6273 return false;
6274
6275 // Don't attempt vectorization if the target does not support vectors.
6276 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
6277 return false;
6278
6279 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
6280
6281 auto FoldInst = [this](Instruction &I) {
6282 Builder.SetInsertPoint(&I);
6283 bool IsVectorType = isa<VectorType>(I.getType());
6284 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
6285 auto Opcode = I.getOpcode();
6286
6287 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
6288
6289 // These folds should be beneficial regardless of when this pass is run
6290 // in the optimization pipeline.
6291 // The type checking is for run-time efficiency. We can avoid wasting time
6292 // dispatching to folding functions if there's no chance of matching.
6293 if (IsFixedVectorType) {
6294 switch (Opcode) {
6295 case Instruction::InsertElement:
6296 if (vectorizeLoadInsert(I))
6297 return true;
6298 break;
6299 case Instruction::ShuffleVector:
6300 if (widenSubvectorLoad(I))
6301 return true;
6302 break;
6303 default:
6304 break;
6305 }
6306 }
6307
6308 // This transform works with scalable and fixed vectors
6309 // TODO: Identify and allow other scalable transforms
6310 if (IsVectorType) {
6311 if (scalarizeOpOrCmp(I))
6312 return true;
6313 if (scalarizeLoad(I))
6314 return true;
6315 if (scalarizeExtExtract(I))
6316 return true;
6317 if (scalarizeVPIntrinsic(I))
6318 return true;
6319 if (foldInterleaveIntrinsics(I))
6320 return true;
6321 if (foldBitcastOfVPLoad(I))
6322 return true;
6323 }
6324
6325 if (foldDeinterleaveIntrinsics(I))
6326 return true;
6327
6328 if (Opcode == Instruction::Store)
6329 if (foldSingleElementStore(I))
6330 return true;
6331
6332 // If this is an early pipeline invocation of this pass, we are done.
6333 if (TryEarlyFoldsOnly)
6334 return false;
6335
6336 if (Opcode == Instruction::Call)
6337 if (foldBitOrderReverseAndSwap(I))
6338 return true;
6339
6340 // Otherwise, try folds that improve codegen but may interfere with
6341 // early IR canonicalizations.
6342 // The type checking is for run-time efficiency. We can avoid wasting time
6343 // dispatching to folding functions if there's no chance of matching.
6344 if (IsFixedVectorType) {
6345 switch (Opcode) {
6346 case Instruction::InsertElement:
6347 if (foldInsExtFNeg(I))
6348 return true;
6349 if (foldInsExtBinop(I))
6350 return true;
6351 if (foldInsExtVectorToShuffle(I))
6352 return true;
6353 break;
6354 case Instruction::ShuffleVector:
6355 if (foldPermuteOfBinops(I))
6356 return true;
6357 if (foldShuffleOfBinops(I))
6358 return true;
6359 if (foldShuffleOfSelects(I))
6360 return true;
6361 if (foldShuffleOfCastops(I))
6362 return true;
6363 if (foldShuffleOfShuffles(I))
6364 return true;
6365 if (foldPermuteOfIntrinsic(I))
6366 return true;
6367 if (foldShufflesOfLengthChangingShuffles(I))
6368 return true;
6369 if (foldShuffleOfIntrinsics(I))
6370 return true;
6371 if (foldSelectShuffle(I))
6372 return true;
6373 if (foldShuffleToIdentity(I))
6374 return true;
6375 break;
6376 case Instruction::Load:
6377 if (shrinkLoadForShuffles(I))
6378 return true;
6379 break;
6380 case Instruction::BitCast:
6381 if (foldBitcastShuffle(I))
6382 return true;
6383 if (foldSelectsFromBitcast(I))
6384 return true;
6385 break;
6386 case Instruction::And:
6387 case Instruction::Or:
6388 case Instruction::Xor:
6389 if (foldBitOpOfCastops(I))
6390 return true;
6391 if (foldBitOpOfCastConstant(I))
6392 return true;
6393 break;
6394 case Instruction::PHI:
6395 if (shrinkPhiOfShuffles(I))
6396 return true;
6397 break;
6398 default:
6399 if (shrinkType(I))
6400 return true;
6401 break;
6402 }
6403 } else {
6404 switch (Opcode) {
6405 case Instruction::Call:
6406 if (foldShuffleFromReductions(I))
6407 return true;
6408 if (foldCastFromReductions(I))
6409 return true;
6410 break;
6411 case Instruction::ExtractElement:
6412 if (foldShuffleChainsToReduce(I))
6413 return true;
6414 break;
6415 case Instruction::ICmp:
6416 if (foldSignBitReductionCmp(I))
6417 return true;
6418 if (foldICmpEqZeroVectorReduce(I))
6419 return true;
6420 if (foldReductionZeroTest(I))
6421 return true;
6422 if (foldEquivalentReductionCmp(I))
6423 return true;
6424 if (foldReduceAddCmpZero(I))
6425 return true;
6426 [[fallthrough]];
6427 case Instruction::FCmp:
6428 if (foldExtractExtract(I))
6429 return true;
6430 break;
6431 case Instruction::Or:
6432 if (foldConcatOfBoolMasks(I))
6433 return true;
6434 [[fallthrough]];
6435 default:
6436 if (Instruction::isBinaryOp(Opcode)) {
6437 if (foldExtractExtract(I))
6438 return true;
6439 if (foldExtractedCmps(I))
6440 return true;
6441 if (foldBinopOfReductions(I))
6442 return true;
6443 }
6444 break;
6445 }
6446 }
6447 return false;
6448 };
6449
6450 bool MadeChange = false;
6451 for (BasicBlock &BB : F) {
6452 // Ignore unreachable basic blocks.
6453 if (!DT.isReachableFromEntry(&BB))
6454 continue;
6455 // Use early increment range so that we can erase instructions in loop.
6456 // make_early_inc_range is not applicable here, as the next iterator may
6457 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
6458 // We manually maintain the next instruction and update it when it is about
6459 // to be deleted.
6460 Instruction *I = &BB.front();
6461 while (I) {
6462 NextInst = I->getNextNode();
6463 if (!I->isDebugOrPseudoInst())
6464 MadeChange |= FoldInst(*I);
6465 I = NextInst;
6466 }
6467 }
6468
6469 NextInst = nullptr;
6470
6471 while (!Worklist.isEmpty()) {
6472 Instruction *I = Worklist.removeOne();
6473 if (!I)
6474 continue;
6475
6478 continue;
6479 }
6480
6481 MadeChange |= FoldInst(*I);
6482 }
6483
6484 return MadeChange;
6485}
6486
6489 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
6491 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
6492 AAResults &AA = FAM.getResult<AAManager>(F);
6493 const DataLayout *DL = &F.getDataLayout();
6496 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, CostKind, TryEarlyFoldsOnly);
6497 if (!Combiner.run())
6498 return PreservedAnalyses::all();
6501 return PA;
6502}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< unsigned > MaxInstrsToScan("aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden, cl::desc("Max number of instructions to scan for aggressive instcombine."))
This is the interface for LLVM's primary stateless and local alias analysis.
#define X(NUM, ENUM, NAME)
Definition ELF.h:856
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
This file defines the DenseMap class.
#define Check(C,...)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
iv users
Definition IVUsers.cpp:48
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition LICM.cpp:1457
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T1
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
FunctionAnalysisManager FAM
if(PassOpts->AAPipeline)
const SmallVectorImpl< MachineOperand > & Cond
unsigned OpIndex
This file contains some templates that are useful if you are working with the STL at all.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static bool isFreeConcat(ArrayRef< InstLane > Item, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI)
Detect concat of multiple values into a vector.
static void analyzeCostOfVecReduction(const IntrinsicInst &II, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI, InstructionCost &CostBeforeReduction, InstructionCost &CostAfterReduction)
static SmallVector< InstLane > generateInstLaneVectorFromOperand(ArrayRef< InstLane > Item, int Op)
static Value * createShiftShuffle(Value *Vec, unsigned OldIndex, unsigned NewIndex, IRBuilderBase &Builder)
Create a shuffle that translates (shifts) 1 element from the input vector to a new element location.
static Value * generateNewInstTree(ArrayRef< InstLane > Item, Use *From, FixedVectorType *Ty, const DenseSet< std::pair< Value *, Use * > > &IdentityLeafs, const DenseSet< std::pair< Value *, Use * > > &SplatLeafs, const DenseSet< std::pair< Value *, Use * > > &ConcatLeafs, IRBuilderBase &Builder, const TargetTransformInfo *TTI)
std::pair< Value *, int > InstLane
static bool isKnownNonPositive(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Used by foldReduceAddCmpZero to check if we can prove that a value is non-positive.
static Align computeAlignmentAfterScalarization(Align VectorAlignment, Type *ScalarType, Value *Idx, const DataLayout &DL)
The memory operation on a vector of ScalarType had alignment of VectorAlignment.
static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI)
Returns true if this ShuffleVectorInst eventually feeds into a vector reduction intrinsic (e....
static cl::opt< bool > DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden, cl::desc("Disable all vector combine transforms"))
static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI)
static const unsigned InvalidIndex
static Value * translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, IRBuilderBase &Builder)
Given an extract element instruction with constant index operand, shuffle the source vector (shift th...
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, const SimplifyQuery &SQ)
Check if it is legal to scalarize a memory access to VecTy at index Idx.
static cl::opt< unsigned > MaxInstrsToScan("vector-combine-max-scan-instrs", cl::init(30), cl::Hidden, cl::desc("Max number of instructions to scan for vector combining."))
static cl::opt< bool > DisableBinopExtractShuffle("disable-binop-extract-shuffle", cl::init(false), cl::Hidden, cl::desc("Disable binop extract to shuffle transforms"))
static InstLane lookThroughShuffles(Value *V, int Lane)
static bool isMemModifiedBetween(BasicBlock::iterator Begin, BasicBlock::iterator End, const MemoryLocation &Loc, AAResults &AA)
static constexpr int Concat[]
Value * RHS
Value * LHS
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1638
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Return true if the attribute exists in this set.
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
BinaryOps getOpcode() const
Definition InstrTypes.h:409
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
void addParamAttrs(unsigned ArgNo, const AttrBuilder &B)
Adds attributes to the indicated argument.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
bool isFPPredicate() const
Definition InstrTypes.h:845
static LLVM_ABI std::optional< CmpPredicate > getMatching(CmpPredicate A, CmpPredicate B)
Compares two CmpPredicates taking samesign into account and returns the canonicalized CmpPredicate if...
Combiner implementation.
Definition Combiner.h:33
static LLVM_ABI Constant * getExtractElement(Constant *Vec, Constant *Idx, Type *OnlyIfReducedTy=nullptr)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
LLVM_ABI ConstantRange urem(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an unsigned remainder operation of...
LLVM_ABI ConstantRange binaryAnd(const ConstantRange &Other) const
Return a new range representing the possible values resulting from a binary-and of a value in this ra...
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:250
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:223
bool empty() const
Definition DenseMap.h:171
iterator end()
Definition DenseMap.h:141
Implements a dense probed hash-table based set.
Definition DenseSet.h:281
Analysis pass which computes a DominatorTree.
Definition Dominators.h:270
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition TypeSize.h:315
This instruction extracts a single (scalar) element from a VectorType value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool noSignedZeros() const
Definition FMF.h:67
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:867
Predicate getSignedPredicate() const
For example, EQ->EQ, SLE->SLE, UGT->SGT, etc.
bool isEquality() const
Return true if this predicate is either EQ or NE.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
LLVM_ABI CallInst * CreateIntrinsicWithoutFolding(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Value * CreateNUWMul(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1469
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2617
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2605
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1923
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2664
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition IRBuilder.h:457
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2683
void SetCurrentDebugLocation(const DebugLoc &L)
Set location information used by debugging information.
Definition IRBuilder.h:221
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1532
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition IRBuilder.h:2266
Value * CreateIsNotNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg > -1.
Definition IRBuilder.h:2707
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:2008
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2291
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:482
LLVM_ABI Value * CreateOrReduce(Value *Src)
Create a vector int OR reduction intrinsic of the source vector.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:477
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2498
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2529
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition IRBuilder.h:146
Value * CreateIsNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg < 0.
Definition IRBuilder.h:2702
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2232
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1906
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1511
LLVM_ABI Value * CreateNAryOp(unsigned Opc, ArrayRef< Value * > Ops, const Twine &Name="", MDNode *FPMathTag=nullptr)
Create either a UnaryOperator or BinaryOperator depending on Opc.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2110
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2639
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1570
LLVM_ABI Value * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={}, function_ref< void(CallInst *)> SetFn=[](CallInst *) {})
Variant to create a possibly constant-folded intrinsic.
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1919
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2096
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:577
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1731
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:181
Value * CreateFNegFMF(Value *V, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1844
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2474
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1592
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition IRBuilder.h:524
LLVM_ABI Value * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *Op, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void push(Instruction *I)
Push the instruction onto the worklist stack.
LLVM_ABI void setHasNoUnsignedWrap(bool b=true)
Set or clear the nuw flag on this instruction, which must be an operator which supports this flag.
LLVM_ABI void copyIRFlags(const Value *V, bool IncludeWrapFlags=true)
Convenience method to copy supported exact, fast-math, and (optionally) wrapping flags from V to this...
LLVM_ABI void setHasNoSignedWrap(bool b=true)
Set or clear the nsw flag on this instruction, which must be an operator which supports this flag.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void andIRFlags(const Value *V)
Logical 'and' of any supported wrapping, exact, and fast-math flags of V and this instruction.
bool isBinaryOp() const
LLVM_ABI void setNonNeg(bool b=true)
Set or clear the nneg flag on this instruction, which must be a zext instruction.
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
LLVM_ABI FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
LLVM_ABI AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdempotent() const
Return true if the instruction is idempotent:
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI bool hasAllowReassoc() const LLVM_READONLY
Determine whether the allow-reassociation flag is set.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:348
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAlignment(Align Align)
Type * getPointerOperandType() const
Align getAlign() const
Return the alignment of the access that is being performed.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
const SDValue & getOperand(unsigned Num) const
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
Definition SetVector.h:252
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
This instruction constructs a fixed permutation of two input vectors.
int getMaskValue(unsigned Elt) const
Return the shuffle mask value of this instruction for the given element index.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static void commuteShuffleMask(MutableArrayRef< int > Mask, unsigned InVecNumElts)
Change values in a shuffle permute mask assuming the two vector operands of length InVecNumElts have ...
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
void setAlignment(Align Align)
Analysis pass providing the TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ None
The insert/extract is not used with a load/store.
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
static LLVM_ABI OperandValueInfo commonOperandInfo(const Value *X, const Value *Y)
Collect common data between two OperandValueInfo inputs.
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const
Returns true if GEP should not be used to index into vectors for this target.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, const Value *Op0=nullptr, const Value *Op1=nullptr, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
LLVM_ABI InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
LLVM_ABI unsigned getMinVectorRegisterBitWidth() const
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
Estimate the overhead of scalarizing an instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:227
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:267
Value * getOperand(unsigned i) const
Definition User.h:207
static LLVM_ABI bool isVPBinOp(Intrinsic::ID ID)
std::optional< unsigned > getFunctionalIntrinsicID() const
std::optional< unsigned > getFunctionalOpcode() const
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition Value.h:727
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:163
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:993
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:147
bool use_empty() const
Definition Value.h:346
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
bool user_empty() const
Definition Value.h:389
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:209
size_type size() const
Definition DenseSet.h:84
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition APInt.h:2277
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition APInt.h:2282
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_and< Ty... > m_CombineAnd(const Ty &...Ps)
Combine pattern matchers matching all of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
auto m_Cmp()
Matches any compare instruction and ignores it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0 >::Ty m_BitReverse(const Opnd0 &Op0)
BinaryOp_match< LHS, RHS, Instruction::URem > m_URem(const LHS &L, const RHS &R)
auto m_Poison()
Match an arbitrary poison constant.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
match_bind< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
DisjointOr_match< LHS, RHS > m_DisjointOr(const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_right_shift_op > m_Shr(const LHS &L, const RHS &R)
Matches logical shift operations.
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_Constant()
Match an arbitrary Constant and ignore it.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
cst_pred_ty< is_non_zero_int > m_NonZeroInt()
Match a non-zero integer or a vector with all non-zero elements.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Shl, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWShl(const LHS &L, const RHS &R)
auto m_AnyIntrinsic()
Matches any intrinsic call and ignores it.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Mul, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWMul(const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_bitwiselogic_op, true > m_c_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
match_combine_or< CastInst_match< OpTy, SExtInst >, NNegZExt_match< OpTy > > m_SExtLike(const OpTy &Op)
Match either "sext" or "zext nneg".
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0 >::Ty m_BSwap(const Opnd0 &Op0)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
m_Intrinsic_Ty< Opnd >::Ty m_Deinterleave2(const Opnd &Op)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
@ Valid
The data is already valid.
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
@ User
could "use" a pointer
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:573
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:830
void stable_sort(R &&Range)
Definition STLExtras.h:2116
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
RelativeUniformCounterPtr Values
Definition InstrProf.h:91
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI Value * simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q)
Given operand for a UnaryOperator, fold the result or return null.
scope_exit(Callable) -> scope_exit< Callable >
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
LLVM_ABI Value * simplifyCall(CallBase *Call, Value *Callee, ArrayRef< Value * > Args, const SimplifyQuery &Q)
Given a callsite, callee, and arguments, fold the result or return null.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
LLVM_ABI bool mustSuppressSpeculation(const LoadInst &LI)
Return true if speculation of the given load must be suppressed to avoid ordering or interfering with...
Definition Loads.cpp:445
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
Definition InstrProf.h:143
unsigned M1(unsigned Val)
Definition VE.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:403
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool isModSet(const ModRefInfo MRI)
Definition ModRef.h:49
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI bool programUndefinedIfPoison(const Instruction *Inst)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:449
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ABI bool isKnownNonZero(const Value *V, const SimplifyQuery &Q, unsigned Depth=0)
Return true if the given value is known to be non-zero when defined.
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
LLVM_ABI bool isSafeToSpeculativelyExecuteWithOpcode(unsigned Opcode, const Instruction *Inst, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
This returns the same result as isSafeToSpeculativelyExecute if Opcode is the actual opcode of Inst.
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
LLVM_ABI Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc)
Returns the reduction intrinsic id corresponding to the binary operation.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Count
Definition InstrProf.h:145
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
LLVM_ABI Constant * getLosslessInvCast(Constant *C, Type *InvCastTo, unsigned CastOp, const DataLayout &DL, PreservedCastFlags *Flags=nullptr)
Try to cast C to InvC losslessly, satisfying CastOp(InvC) equals C, or CastOp(InvC) is a refined valu...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2166
LLVM_ABI Value * simplifyCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a CmpInst, fold the result or return null.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicID(Intrinsic::ID IID)
Returns the llvm.vector.reduce min/max intrinsic that corresponds to the intrinsic op.
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, const SimplifyQuery &SQ, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
LLVM_ABI AAMDNodes adjustForAccess(unsigned AccessSize)
Create a new AAMDNode for accessing AccessSize bytes of this AAMDNode.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:310
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:262
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:146
const DataLayout & DL
const Instruction * CxtI
const DominatorTree * DT
SimplifyQuery getWithInstruction(const Instruction *I) const
AssumptionCache * AC