1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/Transforms/Vectorize/VectorCombine.h"
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
39#include <numeric>
40#include <optional>
41#include <queue>
42#include <set>
43
44#define DEBUG_TYPE "vector-combine"
45#include "llvm/Transforms/Utils/InstructionWorklist.h"
46
47using namespace llvm;
48using namespace llvm::PatternMatch;
49
50STATISTIC(NumVecLoad, "Number of vector loads formed");
51STATISTIC(NumVecCmp, "Number of vector compares formed");
52STATISTIC(NumVecBO, "Number of vector binops formed");
53STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
54STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
55STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
56STATISTIC(NumScalarCmp, "Number of scalar compares formed");
57STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
58
60 "disable-vector-combine", cl::init(false), cl::Hidden,
61 cl::desc("Disable all vector combine transforms"));
62
64 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
65 cl::desc("Disable binop extract to shuffle transforms"));
66
68 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
69 cl::desc("Max number of instructions to scan for vector combining."));
70
71static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
72
73namespace {
74class VectorCombine {
75public:
76 VectorCombine(Function &F, const TargetTransformInfo &TTI,
77 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
78 const DataLayout *DL, TTI::TargetCostKind CostKind,
79 bool TryEarlyFoldsOnly)
80 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
81 DT(DT), AA(AA), DL(DL), CostKind(CostKind),
82 SQ(*DL, /*TLI=*/nullptr, &DT, &AC),
83 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
84
85 bool run();
86
87private:
88 Function &F;
89 IRBuilder<InstSimplifyFolder> Builder;
90 const TargetTransformInfo &TTI;
91 const DominatorTree &DT;
92 AAResults &AA;
93 const DataLayout *DL;
94 TTI::TargetCostKind CostKind;
95 const SimplifyQuery SQ;
96
97 /// If true, only perform beneficial early IR transforms. Do not introduce new
98 /// vector operations.
99 bool TryEarlyFoldsOnly;
100
101 InstructionWorklist Worklist;
102
103 /// The next instruction to visit; updated if that instruction is erased by
104 /// RecursivelyDeleteTriviallyDeadInstructions.
105 Instruction *NextInst;
106
107 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
108 // parameter. That should be updated to specific sub-classes because the
109 // run loop was changed to dispatch on opcode.
110 bool vectorizeLoadInsert(Instruction &I);
111 bool widenSubvectorLoad(Instruction &I);
112 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
113 ExtractElementInst *Ext1,
114 unsigned PreferredExtractIndex) const;
115 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
116 const Instruction &I,
117 ExtractElementInst *&ConvertToShuffle,
118 unsigned PreferredExtractIndex);
119 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
121 bool foldExtractExtract(Instruction &I);
122 bool foldInsExtFNeg(Instruction &I);
123 bool foldInsExtBinop(Instruction &I);
124 bool foldInsExtVectorToShuffle(Instruction &I);
125 bool foldBitOpOfCastops(Instruction &I);
126 bool foldBitOpOfCastConstant(Instruction &I);
127 bool foldBitcastShuffle(Instruction &I);
128 bool scalarizeOpOrCmp(Instruction &I);
129 bool scalarizeVPIntrinsic(Instruction &I);
130 bool foldExtractedCmps(Instruction &I);
131 bool foldSelectsFromBitcast(Instruction &I);
132 bool foldBinopOfReductions(Instruction &I);
133 bool foldSingleElementStore(Instruction &I);
134 bool scalarizeLoad(Instruction &I);
135 bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
136 bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
137 bool scalarizeExtExtract(Instruction &I);
138 bool foldConcatOfBoolMasks(Instruction &I);
139 bool foldPermuteOfBinops(Instruction &I);
140 bool foldShuffleOfBinops(Instruction &I);
141 bool foldShuffleOfSelects(Instruction &I);
142 bool foldShuffleOfCastops(Instruction &I);
143 bool foldShuffleOfShuffles(Instruction &I);
144 bool foldPermuteOfIntrinsic(Instruction &I);
145 bool foldShufflesOfLengthChangingShuffles(Instruction &I);
146 bool foldShuffleOfIntrinsics(Instruction &I);
147 bool foldShuffleToIdentity(Instruction &I);
148 bool foldShuffleFromReductions(Instruction &I);
149 bool foldShuffleChainsToReduce(Instruction &I);
150 bool foldCastFromReductions(Instruction &I);
151 bool foldSignBitReductionCmp(Instruction &I);
152 bool foldICmpEqZeroVectorReduce(Instruction &I);
153 bool foldEquivalentReductionCmp(Instruction &I);
154 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
155 bool foldInterleaveIntrinsics(Instruction &I);
156 bool shrinkType(Instruction &I);
157 bool shrinkLoadForShuffles(Instruction &I);
158 bool shrinkPhiOfShuffles(Instruction &I);
159
160 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
161 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
162 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
163 Old.replaceAllUsesWith(&New);
164 if (auto *NewI = dyn_cast<Instruction>(&New)) {
165 New.takeName(&Old);
166 Worklist.pushUsersToWorkList(*NewI);
167 Worklist.pushValue(NewI);
168 }
169 if (Erase && isInstructionTriviallyDead(&Old)) {
170 eraseInstruction(Old);
171 } else {
172 Worklist.push(&Old);
173 }
174 }
175
176 void eraseInstruction(Instruction &I) {
177 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
178 SmallVector<Value *> Ops(I.operands());
179 Worklist.remove(&I);
180 I.eraseFromParent();
181
182 // Push remaining users of the operands and then the operand itself - allows
183 // further folds that were hindered by OneUse limits.
184 SmallPtrSet<Value *, 4> Visited;
185 for (Value *Op : Ops) {
186 if (!Visited.contains(Op)) {
187 if (auto *OpI = dyn_cast<Instruction>(Op)) {
188 if (RecursivelyDeleteTriviallyDeadInstructions(
189 OpI, nullptr, nullptr, [&](Value *V) {
190 if (auto *I = dyn_cast<Instruction>(V)) {
191 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
192 Worklist.remove(I);
193 if (I == NextInst)
194 NextInst = NextInst->getNextNode();
195 Visited.insert(I);
196 }
197 }))
198 continue;
199 Worklist.pushUsersToWorkList(*OpI);
200 Worklist.pushValue(OpI);
201 }
202 }
203 }
204 }
205};
206} // namespace
207
208/// Return the source operand of a potentially bitcasted value. If there is no
209/// bitcast, return the input value itself.
210static Value *peekThroughBitcasts(Value *V) {
211 while (auto *BitCast = dyn_cast<BitCastInst>(V))
212 V = BitCast->getOperand(0);
213 return V;
214}
215
216static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
217 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
218 // The widened load may load data from dirty regions or create data races
219 // non-existent in the source.
220 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
221 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
222 mustSuppressSpeculation(*Load))
223 return false;
224
225 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
226 // sure we have all of our type-based constraints in place for this target.
227 Type *ScalarTy = Load->getType()->getScalarType();
228 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
229 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
230 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
231 ScalarSize % 8 != 0)
232 return false;
233
234 return true;
235}
236
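/// Widen a scalar load feeding an insertelement at lane 0 into a vector load.
/// Illustrative sketch only; the element type, vector width and pointer name
/// are example assumptions, not taken from the pass or its tests:
///   %s = load float, ptr %p
///   %v = insertelement <4 x float> poison, float %s, i64 0
/// may become (subject to the cost model and dereferenceability checks below;
/// a shuffle keeps only lane 0 defined):
///   %ld = load <4 x float>, ptr %p
///   %v  = shufflevector <4 x float> %ld, <4 x float> poison,
///                       <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>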
237bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
238 // Match insert into fixed vector of scalar value.
239 // TODO: Handle non-zero insert index.
240 Value *Scalar;
241 if (!match(&I,
242 m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
243 return false;
244
245 // Optionally match an extract from another vector.
246 Value *X;
247 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
248 if (!HasExtract)
249 X = Scalar;
250
251 auto *Load = dyn_cast<LoadInst>(X);
252 if (!canWidenLoad(Load, TTI))
253 return false;
254
255 Type *ScalarTy = Scalar->getType();
256 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
257 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
258
259 // Check safety of replacing the scalar load with a larger vector load.
260 // We use minimal alignment (maximum flexibility) because we only care about
261 // the dereferenceable region. When calculating cost and creating a new op,
262 // we may use a larger value based on alignment attributes.
263 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
264 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
265
266 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
267 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
268 unsigned OffsetEltIndex = 0;
269 Align Alignment = Load->getAlign();
270 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, SQ.AC,
271 SQ.DT)) {
272 // It is not safe to load directly from the pointer, but we can still peek
273 // through gep offsets and check if it is safe to load from a base address
274 // with updated alignment. If it is, we can shuffle the element(s) into place
275 // after loading.
276 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
277 APInt Offset(OffsetBitWidth, 0);
278 SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
279
280 // We want to shuffle the result down from a high element of a vector, so
281 // the offset must be positive.
282 if (Offset.isNegative())
283 return false;
284
285 // The offset must be a multiple of the scalar element to shuffle cleanly
286 // in the element's size.
287 uint64_t ScalarSizeInBytes = ScalarSize / 8;
288 if (Offset.urem(ScalarSizeInBytes) != 0)
289 return false;
290
291 // If we load MinVecNumElts, will our target element still be loaded?
292 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
293 if (OffsetEltIndex >= MinVecNumElts)
294 return false;
295
296 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load,
297 SQ.AC, SQ.DT))
298 return false;
299
300 // Update alignment with offset value. Note that the offset could be negated
301 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
302 // negation does not change the result of the alignment calculation.
303 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
304 }
305
306 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
307 // Use the greater of the alignment on the load or its source pointer.
308 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
309 Type *LoadTy = Load->getType();
310 unsigned AS = Load->getPointerAddressSpace();
311 InstructionCost OldCost =
312 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
313 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
314 OldCost +=
315 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
316 /* Insert */ true, HasExtract, CostKind);
317
318 // New pattern: load VecPtr
319 InstructionCost NewCost =
320 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
321 // Optionally, we are shuffling the loaded vector element(s) into place.
322 // For the mask set everything but element 0 to undef to prevent poison from
323 // propagating from the extra loaded memory. This will also optionally
324 // shrink/grow the vector from the loaded size to the output size.
325 // We assume this operation has no cost in codegen if there was no offset.
326 // Note that we could use freeze to avoid poison problems, but then we might
327 // still need a shuffle to change the vector size.
328 auto *Ty = cast<FixedVectorType>(I.getType());
329 unsigned OutputNumElts = Ty->getNumElements();
330 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
331 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
332 Mask[0] = OffsetEltIndex;
333 if (OffsetEltIndex)
334 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
335 CostKind);
336
337 // We can aggressively convert to the vector form because the backend can
338 // invert this transform if it does not result in a performance win.
339 if (OldCost < NewCost || !NewCost.isValid())
340 return false;
341
342 // It is safe and potentially profitable to load a vector directly:
343 // inselt undef, load Scalar, 0 --> load VecPtr
344 IRBuilder<> Builder(Load);
345 Value *CastedPtr =
346 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
347 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
348 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
349
350 replaceValue(I, *VecLd);
351 ++NumVecLoad;
352 return true;
353}
354
355/// If we are loading a vector and then inserting it into a larger vector with
356/// undefined elements, try to load the larger vector and eliminate the insert.
357/// This removes a shuffle in IR and may allow combining of other loaded values.
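/// Illustrative sketch (types and widths are example assumptions only):
///   %small = load <2 x float>, ptr %p
///   %wide  = shufflevector <2 x float> %small, <2 x float> poison,
///                          <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
/// may become, when the trailing bytes are known dereferenceable:
///   %wide = load <4 x float>, ptr %p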
358bool VectorCombine::widenSubvectorLoad(Instruction &I) {
359 // Match subvector insert of fixed vector.
360 auto *Shuf = cast<ShuffleVectorInst>(&I);
361 if (!Shuf->isIdentityWithPadding())
362 return false;
363
364 // Allow a non-canonical shuffle mask that is choosing elements from op1.
365 unsigned NumOpElts =
366 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
367 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
368 return M >= (int)(NumOpElts);
369 });
370
371 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
372 if (!canWidenLoad(Load, TTI))
373 return false;
374
375 // We use minimal alignment (maximum flexibility) because we only care about
376 // the dereferenceable region. When calculating cost and creating a new op,
377 // we may use a larger value based on alignment attributes.
378 auto *Ty = cast<FixedVectorType>(I.getType());
379 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
380 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
381 Align Alignment = Load->getAlign();
382 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, SQ.AC,
383 SQ.DT))
384 return false;
385
386 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
387 Type *LoadTy = Load->getType();
388 unsigned AS = Load->getPointerAddressSpace();
389
390 // Original pattern: insert_subvector (load PtrOp)
391 // This conservatively assumes that the cost of a subvector insert into an
392 // undef value is 0. We could add that cost if the cost model accurately
393 // reflects the real cost of that operation.
394 InstructionCost OldCost =
395 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
396
397 // New pattern: load PtrOp
398 InstructionCost NewCost =
399 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
400
401 // We can aggressively convert to the vector form because the backend can
402 // invert this transform if it does not result in a performance win.
403 if (OldCost < NewCost || !NewCost.isValid())
404 return false;
405
406 IRBuilder<> Builder(Load);
407 Value *CastedPtr =
408 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
409 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
410 replaceValue(I, *VecLd);
411 ++NumVecLoad;
412 return true;
413}
414
415/// Determine which, if any, of the inputs should be replaced by a shuffle
416/// followed by extract from a different index.
417ExtractElementInst *VectorCombine::getShuffleExtract(
418 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
419 unsigned PreferredExtractIndex = InvalidIndex) const {
420 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
421 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
422 assert(Index0C && Index1C && "Expected constant extract indexes");
423
424 unsigned Index0 = Index0C->getZExtValue();
425 unsigned Index1 = Index1C->getZExtValue();
426
427 // If the extract indexes are identical, no shuffle is needed.
428 if (Index0 == Index1)
429 return nullptr;
430
431 Type *VecTy = Ext0->getVectorOperand()->getType();
432 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
433 InstructionCost Cost0 =
434 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
435 InstructionCost Cost1 =
436 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
437
438 // If both costs are invalid, no shuffle is needed.
439 if (!Cost0.isValid() && !Cost1.isValid())
440 return nullptr;
441
442 // We are extracting from 2 different indexes, so one operand must be shuffled
443 // before performing a vector operation and/or extract. The more expensive
444 // extract will be replaced by a shuffle.
445 if (Cost0 > Cost1)
446 return Ext0;
447 if (Cost1 > Cost0)
448 return Ext1;
449
450 // If the costs are equal and there is a preferred extract index, shuffle the
451 // opposite operand.
452 if (PreferredExtractIndex == Index0)
453 return Ext1;
454 if (PreferredExtractIndex == Index1)
455 return Ext0;
456
457 // Otherwise, replace the extract with the higher index.
458 return Index0 > Index1 ? Ext0 : Ext1;
459}
460
461/// Compare the relative costs of 2 extracts followed by scalar operation vs.
462/// vector operation(s) followed by extract. Return true if the existing
463/// instructions are cheaper than a vector alternative. Otherwise, return false
464/// and if one of the extracts should be transformed to a shufflevector, set
465/// \p ConvertToShuffle to that extract instruction.
466bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
467 ExtractElementInst *Ext1,
468 const Instruction &I,
469 ExtractElementInst *&ConvertToShuffle,
470 unsigned PreferredExtractIndex) {
471 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
472 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
473 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
474
475 unsigned Opcode = I.getOpcode();
476 Value *Ext0Src = Ext0->getVectorOperand();
477 Value *Ext1Src = Ext1->getVectorOperand();
478 Type *ScalarTy = Ext0->getType();
479 auto *VecTy = cast<VectorType>(Ext0Src->getType());
480 InstructionCost ScalarOpCost, VectorOpCost;
481
482 // Get cost estimates for scalar and vector versions of the operation.
483 bool IsBinOp = Instruction::isBinaryOp(Opcode);
484 if (IsBinOp) {
485 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
486 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
487 } else {
488 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
489 "Expected a compare");
490 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
491 ScalarOpCost = TTI.getCmpSelInstrCost(
492 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
493 VectorOpCost = TTI.getCmpSelInstrCost(
494 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
495 }
496
497 // Get cost estimates for the extract elements. These costs will factor into
498 // both sequences.
499 unsigned Ext0Index = Ext0IndexC->getZExtValue();
500 unsigned Ext1Index = Ext1IndexC->getZExtValue();
501
502 InstructionCost Extract0Cost =
503 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
504 InstructionCost Extract1Cost =
505 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
506
507 // A more expensive extract will always be replaced by a splat shuffle.
508 // For example, if Ext0 is more expensive:
509 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
510 // extelt (opcode (splat V0, Ext0), V1), Ext1
511 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
512 // check the cost of creating a broadcast shuffle and shuffling both
513 // operands to element 0.
514 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
515 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
516 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
517
518 // Extra uses of the extracts mean that we include those costs in the
519 // vector total because those instructions will not be eliminated.
520 InstructionCost OldCost, NewCost;
521 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
522 // Handle a special case. If the 2 extracts are identical, adjust the
523 // formulas to account for that. The extra use charge allows for either the
524 // CSE'd pattern or an unoptimized form with identical values:
525 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
526 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
527 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
528 OldCost = CheapExtractCost + ScalarOpCost;
529 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
530 } else {
531 // Handle the general case. Each extract is actually a different value:
532 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
533 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
534 NewCost = VectorOpCost + CheapExtractCost +
535 !Ext0->hasOneUse() * Extract0Cost +
536 !Ext1->hasOneUse() * Extract1Cost;
537 }
538
539 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
540 if (ConvertToShuffle) {
541 if (IsBinOp && DisableBinopExtractShuffle)
542 return true;
543
544 // If we are extracting from 2 different indexes, then one operand must be
545 // shuffled before performing the vector operation. The shuffle mask is
546 // poison except for 1 lane that is being translated to the remaining
547 // extraction lane. Therefore, it is a splat shuffle. Ex:
548 // ShufMask = { poison, poison, 0, poison }
549 // TODO: The cost model has an option for a "broadcast" shuffle
550 // (splat-from-element-0), but no option for a more general splat.
551 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
552 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
553 PoisonMaskElem);
554 ShuffleMask[BestInsIndex] = BestExtIndex;
555 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
556 VecTy, VecTy, ShuffleMask, CostKind, 0,
557 nullptr, {ConvertToShuffle});
558 } else {
559 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
560 VecTy, VecTy, {}, CostKind, 0, nullptr,
561 {ConvertToShuffle});
562 }
563 }
564
565 // Aggressively form a vector op if the cost is equal because the transform
566 // may enable further optimization.
567 // Codegen can reverse this transform (scalarize) if it was not profitable.
568 return OldCost < NewCost;
569}
570
571/// Create a shuffle that translates (shifts) 1 element from the input vector
572/// to a new element location.
573static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
574 unsigned NewIndex, IRBuilderBase &Builder) {
575 // The shuffle mask is poison except for 1 lane that is being translated
576 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
577 // ShufMask = { 2, poison, poison, poison }
578 auto *VecTy = cast<FixedVectorType>(Vec->getType());
579 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
580 ShufMask[NewIndex] = OldIndex;
581 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
582}
583
584/// Given an extract element instruction with constant index operand, shuffle
585/// the source vector (shift the scalar element) to a NewIndex for extraction.
586/// Return null if the input can be constant folded, so that we are not creating
587/// unnecessary instructions.
588static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
589 IRBuilderBase &Builder) {
590 // Shufflevectors can only be created for fixed-width vectors.
591 Value *X = ExtElt->getVectorOperand();
592 if (!isa<FixedVectorType>(X->getType()))
593 return nullptr;
594
595 // If the extract can be constant-folded, this code is unsimplified. Defer
596 // to other passes to handle that.
597 Value *C = ExtElt->getIndexOperand();
598 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
599 if (isa<Constant>(X))
600 return nullptr;
601
602 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
603 NewIndex, Builder);
604 return Shuf;
605}
606
607/// Try to reduce extract element costs by converting scalar compares to vector
608/// compares followed by extract.
609/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
610Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
611 Instruction &I) {
612 assert(isa<CmpInst>(&I) && "Expected a compare");
613
614 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
615 // --> extelt (cmp Pred V0, V1), ExtIndex
616 ++NumVecCmp;
617 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
618 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
619 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
620}
621
622/// Try to reduce extract element costs by converting scalar binops to vector
623/// binops followed by extract.
624/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
625Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
626 Instruction &I) {
627 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
628
629 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
630 // --> extelt (bo V0, V1), ExtIndex
631 ++NumVecBO;
632 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
633 V1, "foldExtExtBinop");
634
635 // All IR flags are safe to back-propagate because any potential poison
636 // created in unused vector elements is discarded by the extract.
637 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
638 VecBOInst->copyIRFlags(&I);
639
640 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
641}
642
643/// Match an instruction with extracted vector operands.
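/// Illustrative sketch (the operation and types are example assumptions only):
///   %e0 = extractelement <4 x i32> %x, i64 1
///   %e1 = extractelement <4 x i32> %y, i64 1
///   %r  = add i32 %e0, %e1
/// may become, when the cost model agrees:
///   %v = add <4 x i32> %x, %y
///   %r = extractelement <4 x i32> %v, i64 1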
644bool VectorCombine::foldExtractExtract(Instruction &I) {
645 // It is not safe to transform things like div, urem, etc. because we may
646 // create undefined behavior when executing those on unknown vector elements.
647 if (!isSafeToSpeculativelyExecute(&I))
648 return false;
649
650 Instruction *I0, *I1;
651 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
652 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
653 !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
654 return false;
655
656 Value *V0, *V1;
657 uint64_t C0, C1;
658 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
659 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
660 V0->getType() != V1->getType())
661 return false;
662
663 // For fixed-width vectors, reject out-of-bounds extract indexes
664 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(V0->getType())) {
665 unsigned NumElts = FixedVecTy->getNumElements();
666 if (C0 >= NumElts || C1 >= NumElts)
667 return false;
668 }
669
670 // If the scalar value 'I' is going to be re-inserted into a vector, then try
671 // to create an extract to that same element. The extract/insert can be
672 // reduced to a "select shuffle".
673 // TODO: If we add a larger pattern match that starts from an insert, this
674 // probably becomes unnecessary.
675 auto *Ext0 = cast<ExtractElementInst>(I0);
676 auto *Ext1 = cast<ExtractElementInst>(I1);
677 uint64_t InsertIndex = InvalidIndex;
678 if (I.hasOneUse())
679 match(I.user_back(),
680 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
681
682 ExtractElementInst *ExtractToChange;
683 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
684 return false;
685
686 Value *ExtOp0 = Ext0->getVectorOperand();
687 Value *ExtOp1 = Ext1->getVectorOperand();
688
689 if (ExtractToChange) {
690 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
691 Value *NewExtOp =
692 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
693 if (!NewExtOp)
694 return false;
695 if (ExtractToChange == Ext0)
696 ExtOp0 = NewExtOp;
697 else
698 ExtOp1 = NewExtOp;
699 }
700
701 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
702 : Ext0->getIndexOperand();
703 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
704 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
705 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
706 Worklist.push(Ext0);
707 Worklist.push(Ext1);
708 replaceValue(I, *NewExt);
709 return true;
710}
711
712/// Try to replace an extract + scalar fneg + insert with a vector fneg +
713/// shuffle.
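/// Illustrative sketch (same-length vectors; types and lane are example
/// assumptions):
///   %e = extractelement <4 x float> %src, i64 2
///   %n = fneg float %e
///   %r = insertelement <4 x float> %dst, float %n, i64 2
/// may become a vector fneg plus a select-style shuffle:
///   %vn = fneg <4 x float> %src
///   %r  = shufflevector <4 x float> %dst, <4 x float> %vn,
///                       <4 x i32> <i32 0, i32 1, i32 6, i32 3>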
714bool VectorCombine::foldInsExtFNeg(Instruction &I) {
715 // Match an insert (op (extract)) pattern.
716 Value *DstVec;
717 uint64_t ExtIdx, InsIdx;
718 Instruction *FNeg;
719 if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
720 m_ConstantInt(InsIdx))))
721 return false;
722
723 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
724 Value *SrcVec;
725 Instruction *Extract;
726 if (!match(FNeg, m_FNeg(m_CombineAnd(
727 m_Instruction(Extract),
728 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
729 return false;
730
731 auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
732 auto *DstVecScalarTy = DstVecTy->getScalarType();
733 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
734 if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
735 return false;
736
737 // Ignore if the insert/extract index is out of bounds or the destination
738 // vector has only one element.
739 unsigned NumDstElts = DstVecTy->getNumElements();
740 unsigned NumSrcElts = SrcVecTy->getNumElements();
741 if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
742 return false;
743
744 // We are inserting the negated element into the same lane that we extracted
745 // from. This is equivalent to a select-shuffle that chooses all but the
746 // negated element from the destination vector.
747 SmallVector<int> Mask(NumDstElts);
748 std::iota(Mask.begin(), Mask.end(), 0);
749 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
750 InstructionCost OldCost =
751 TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
752 TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
753
754 // If the extract has one use, it will be eliminated, so count it in the
755 // original cost. If it has more than one use, ignore the cost because it will
756 // be the same before/after.
757 if (Extract->hasOneUse())
758 OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
759
760 InstructionCost NewCost =
761 TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
762 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, DstVecTy,
763 DstVecTy, Mask, CostKind);
764
765 bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
766 // If the lengths of the two vectors are not equal,
767 // we need to add a length-change vector. Add this cost.
768 SmallVector<int> SrcMask;
769 if (NeedLenChg) {
770 SrcMask.assign(NumDstElts, PoisonMaskElem);
771 SrcMask[ExtIdx % NumDstElts] = ExtIdx;
772 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
773 DstVecTy, SrcVecTy, SrcMask, CostKind);
774 }
775
776 LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
777 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
778 << "\n");
779 if (NewCost > OldCost)
780 return false;
781
782 Value *NewShuf, *LenChgShuf = nullptr;
783 // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
784 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
785 if (NeedLenChg) {
786 // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
787 LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
788 NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
789 Worklist.pushValue(LenChgShuf);
790 } else {
791 // shuffle DstVec, (fneg SrcVec), Mask
792 NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
793 }
794
795 Worklist.pushValue(VecFNeg);
796 replaceValue(I, *NewShuf);
797 return true;
798}
799
800/// Try to fold insert(binop(x,y),binop(a,b),idx)
801/// --> binop(insert(x,a,idx),insert(y,b,idx))
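/// Illustrative sketch (add is an example; any pair of matching binops
/// qualifies, subject to the cost model):
///   %vec = add <4 x i32> %x, %y
///   %scl = add i32 %a, %b
///   %r   = insertelement <4 x i32> %vec, i32 %scl, i64 0
/// may become:
///   %x1 = insertelement <4 x i32> %x, i32 %a, i64 0
///   %y1 = insertelement <4 x i32> %y, i32 %b, i64 0
///   %r  = add <4 x i32> %x1, %y1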
802bool VectorCombine::foldInsExtBinop(Instruction &I) {
803 BinaryOperator *VecBinOp, *SclBinOp;
804 uint64_t Index;
805 if (!match(&I,
806 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
807 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
808 return false;
809
810 // TODO: Add support for addlike etc.
811 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
812 if (BinOpcode != SclBinOp->getOpcode())
813 return false;
814
815 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
816 if (!ResultTy)
817 return false;
818
819 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
820 // shuffle?
821
822 InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
823 TTI.getInstructionCost(VecBinOp, CostKind) +
824 TTI.getInstructionCost(SclBinOp, CostKind);
825 InstructionCost NewCost =
826 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
827 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
828 Index, VecBinOp->getOperand(0),
829 SclBinOp->getOperand(0)) +
830 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
831 Index, VecBinOp->getOperand(1),
832 SclBinOp->getOperand(1));
833
834 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
835 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
836 << "\n");
837 if (NewCost > OldCost)
838 return false;
839
840 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
841 SclBinOp->getOperand(0), Index);
842 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
843 SclBinOp->getOperand(1), Index);
844 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
845
846 // Intersect flags from the old binops.
847 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
848 NewInst->copyIRFlags(VecBinOp);
849 NewInst->andIRFlags(SclBinOp);
850 }
851
852 Worklist.pushValue(NewIns0);
853 Worklist.pushValue(NewIns1);
854 replaceValue(I, *NewBO);
855 return true;
856}
857
858/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
859/// Supports: bitcast, trunc, sext, zext
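/// Illustrative sketch for the zext case (element widths are example
/// assumptions only):
///   %a = zext <4 x i8> %x to <4 x i32>
///   %b = zext <4 x i8> %y to <4 x i32>
///   %r = and <4 x i32> %a, %b
/// may become, when the narrow bitop plus one cast is no more expensive:
///   %t = and <4 x i8> %x, %y
///   %r = zext <4 x i8> %t to <4 x i32>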
860bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
861 // Check if this is a bitwise logic operation
862 auto *BinOp = dyn_cast<BinaryOperator>(&I);
863 if (!BinOp || !BinOp->isBitwiseLogicOp())
864 return false;
865
866 // Get the cast instructions
867 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
868 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
869 if (!LHSCast || !RHSCast) {
870 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
871 return false;
872 }
873
874 // Both casts must be the same type
875 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
876 if (CastOpcode != RHSCast->getOpcode())
877 return false;
878
879 // Only handle supported cast operations
880 switch (CastOpcode) {
881 case Instruction::BitCast:
882 case Instruction::Trunc:
883 case Instruction::SExt:
884 case Instruction::ZExt:
885 break;
886 default:
887 return false;
888 }
889
890 Value *LHSSrc = LHSCast->getOperand(0);
891 Value *RHSSrc = RHSCast->getOperand(0);
892
893 // Source types must match
894 if (LHSSrc->getType() != RHSSrc->getType())
895 return false;
896
897 auto *SrcTy = LHSSrc->getType();
898 auto *DstTy = I.getType();
899 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
900 // Other casts only handle vector types with integer elements.
901 if (CastOpcode != Instruction::BitCast &&
902 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
903 return false;
904
905 // Only integer scalar/vector values are legal for bitwise logic operations.
906 if (!SrcTy->getScalarType()->isIntegerTy() ||
907 !DstTy->getScalarType()->isIntegerTy())
908 return false;
909
910 // Cost Check :
911 // OldCost = bitlogic + 2*casts
912 // NewCost = bitlogic + cast
913
914 // Calculate specific costs for each cast with instruction context
915 InstructionCost LHSCastCost = TTI.getCastInstrCost(
916 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
917 InstructionCost RHSCastCost = TTI.getCastInstrCost(
918 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
919
920 InstructionCost OldCost =
921 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
922 LHSCastCost + RHSCastCost;
923
924 // For new cost, we can't provide an instruction (it doesn't exist yet)
925 InstructionCost GenericCastCost = TTI.getCastInstrCost(
926 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
927
928 InstructionCost NewCost =
929 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
930 GenericCastCost;
931
932 // Account for multi-use casts using specific costs
933 if (!LHSCast->hasOneUse())
934 NewCost += LHSCastCost;
935 if (!RHSCast->hasOneUse())
936 NewCost += RHSCastCost;
937
938 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
939 << " NewCost=" << NewCost << "\n");
940
941 if (NewCost > OldCost)
942 return false;
943
944 // Create the operation on the source type
945 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
946 BinOp->getName() + ".inner");
947 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
948 NewBinOp->copyIRFlags(BinOp);
949
950 Worklist.pushValue(NewOp);
951
952 // Create the cast operation directly to ensure we get a new instruction
953 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
954
955 // Preserve cast instruction flags
956 NewCast->copyIRFlags(LHSCast);
957 NewCast->andIRFlags(RHSCast);
958
959 // Insert the new instruction
960 Value *Result = Builder.Insert(NewCast);
961
962 replaceValue(I, *Result);
963 return true;
964}
965
966/// Match:
967///   bitop(castop(x), C) ->
968///   bitop(castop(x), castop(InvC)) ->
969///   castop(bitop(x, InvC))
970/// Supports: bitcast, trunc, sext, zext (see the cast switch below).
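/// Illustrative sketch for the bitcast case (types are example assumptions;
/// C and InvC stand for constants related by bitcast(InvC) == C):
///   %a = bitcast <2 x i32> %x to <4 x i16>
///   %r = and <4 x i16> %a, C
/// may become:
///   %t = and <2 x i32> %x, InvC
///   %r = bitcast <2 x i32> %t to <4 x i16>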
971bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
972 Value *LHS;
973 Constant *C;
974
975 // Check if this is a bitwise logic operation
976 if (!match(&I, m_c_BitwiseLogic(m_Value(LHS), m_Constant(C))))
977 return false;
978
979 // Get the cast instructions
980 auto *LHSCast = dyn_cast<CastInst>(LHS);
981 if (!LHSCast)
982 return false;
983
984 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
985
986 // Only handle supported cast operations
987 switch (CastOpcode) {
988 case Instruction::BitCast:
989 case Instruction::ZExt:
990 case Instruction::SExt:
991 case Instruction::Trunc:
992 break;
993 default:
994 return false;
995 }
996
997 Value *LHSSrc = LHSCast->getOperand(0);
998
999 auto *SrcTy = LHSSrc->getType();
1000 auto *DstTy = I.getType();
1001 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
1002 // Other casts only handle vector types with integer elements.
1003 if (CastOpcode != Instruction::BitCast &&
1004 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
1005 return false;
1006
1007 // Only integer scalar/vector values are legal for bitwise logic operations.
1008 if (!SrcTy->getScalarType()->isIntegerTy() ||
1009 !DstTy->getScalarType()->isIntegerTy())
1010 return false;
1011
1012 // Find the constant InvC, such that castop(InvC) equals to C.
1013 PreservedCastFlags RHSFlags;
1014 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
1015 if (!InvC)
1016 return false;
1017
1018 // Cost Check :
1019 // OldCost = bitlogic (on DstTy) + cast
1020 // NewCost = bitlogic (on SrcTy) + cast
1021
1022 // Calculate specific costs for each cast with instruction context
1023 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1024 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1025
1026 InstructionCost OldCost =
1027 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1028
1029 // For new cost, we can't provide an instruction (it doesn't exist yet)
1030 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1031 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1032
1033 InstructionCost NewCost =
1034 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1035 GenericCastCost;
1036
1037 // Account for multi-use casts using specific costs
1038 if (!LHSCast->hasOneUse())
1039 NewCost += LHSCastCost;
1040
1041 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1042 << " NewCost=" << NewCost << "\n");
1043
1044 if (NewCost > OldCost)
1045 return false;
1046
1047 // Create the operation on the source type
1048 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1049 LHSSrc, InvC, I.getName() + ".inner");
1050 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1051 NewBinOp->copyIRFlags(&I);
1052
1053 Worklist.pushValue(NewOp);
1054
1055 // Create the cast operation directly to ensure we get a new instruction
1056 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1057
1058 // Preserve cast instruction flags
1059 if (RHSFlags.NNeg)
1060 NewCast->setNonNeg();
1061 if (RHSFlags.NUW)
1062 NewCast->setHasNoUnsignedWrap();
1063 if (RHSFlags.NSW)
1064 NewCast->setHasNoSignedWrap();
1065
1066 NewCast->andIRFlags(LHSCast);
1067
1068 // Insert the new instruction
1069 Value *Result = Builder.Insert(NewCast);
1070
1071 replaceValue(I, *Result);
1072 return true;
1073}
1074
1075/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1076/// destination type followed by shuffle. This can enable further transforms by
1077/// moving bitcasts or shuffles together.
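/// Illustrative sketch for a unary shuffle (element widths are example
/// assumptions; the mask is widened/narrowed to match the new element size):
///   %s = shufflevector <4 x i32> %v, <4 x i32> poison,
///                      <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = bitcast <4 x i32> %s to <8 x i16>
/// may become:
///   %c = bitcast <4 x i32> %v to <8 x i16>
///   %r = shufflevector <8 x i16> %c, <8 x i16> poison,
///        <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>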
1078bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1079 Value *V0, *V1;
1080 ArrayRef<int> Mask;
1081 if (!match(&I, m_BitCast(m_OneUse(
1082 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1083 return false;
1084
1085 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1086 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1087 // mask for scalable type is a splat or not.
1088 // 2) Disallow non-vector casts.
1089 // TODO: We could allow any shuffle.
1090 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1091 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1092 if (!DestTy || !SrcTy)
1093 return false;
1094
1095 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1096 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1097 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1098 return false;
1099
1100 bool IsUnary = isa<UndefValue>(V1);
1101
1102 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1103 // if it won't increase the number of bitcasts.
1104 if (!IsUnary) {
1105 auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
1106 auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
1107 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1108 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1109 return false;
1110 }
1111
1112 SmallVector<int, 16> NewMask;
1113 if (DestEltSize <= SrcEltSize) {
1114 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1115 // always be expanded to the equivalent form choosing narrower elements.
1116 if (SrcEltSize % DestEltSize != 0)
1117 return false;
1118 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1119 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1120 } else {
1121 // The bitcast is from narrow elements to wide elements. The shuffle mask
1122 // must choose consecutive elements to allow casting first.
1123 if (DestEltSize % SrcEltSize != 0)
1124 return false;
1125 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1126 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1127 return false;
1128 }
1129
1130 // Bitcast the shuffle source - keep its original width but use the destination
1131 // scalar type.
1132 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1133 auto *NewShuffleTy =
1134 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1135 auto *OldShuffleTy =
1136 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1137 unsigned NumOps = IsUnary ? 1 : 2;
1138
1139 // The new shuffle must not cost more than the old shuffle.
1140 TargetTransformInfo::ShuffleKind SK =
1141 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
1142 : TargetTransformInfo::SK_PermuteTwoSrc;
1143
1144 InstructionCost NewCost =
1145 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1146 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1147 TargetTransformInfo::CastContextHint::None,
1148 CostKind));
1149 InstructionCost OldCost =
1150 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1151 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1152 TargetTransformInfo::CastContextHint::None,
1153 CostKind);
1154
1155 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1156 << OldCost << " vs NewCost: " << NewCost << "\n");
1157
1158 if (NewCost > OldCost || !NewCost.isValid())
1159 return false;
1160
1161 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1162 ++NumShufOfBitcast;
1163 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1164 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1165 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1166 replaceValue(I, *Shuf);
1167 return true;
1168}
1169
1170/// VP Intrinsics whose vector operands are both splat values may be simplified
1171/// into the scalar version of the operation and the result splatted. This
1172/// can lead to scalarization down the line.
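/// Illustrative sketch for vp.add with an all-true mask (types are example
/// assumptions, and %evl is assumed known non-zero where that matters):
///   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %xsplat, <4 x i32> %ysplat,
///            <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 %evl)
/// may become a scalar op whose result is re-splatted:
///   %s   = add i32 %x, %y
///   %ins = insertelement <4 x i32> poison, i32 %s, i64 0
///   %r   = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer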
1173bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1174 if (!isa<VPIntrinsic>(I))
1175 return false;
1176 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1177 Value *Op0 = VPI.getArgOperand(0);
1178 Value *Op1 = VPI.getArgOperand(1);
1179
1180 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1181 return false;
1182
1183 // Check getSplatValue early in this function, to avoid doing unnecessary
1184 // work.
1185 Value *ScalarOp0 = getSplatValue(Op0);
1186 Value *ScalarOp1 = getSplatValue(Op1);
1187 if (!ScalarOp0 || !ScalarOp1)
1188 return false;
1189
1190 // For the binary VP intrinsics supported here, the result on disabled lanes
1191 // is a poison value. For now, only do this simplification if all lanes
1192 // are active.
1193 // TODO: Relax the condition that all lanes are active by using insertelement
1194 // on inactive lanes.
1195 auto IsAllTrueMask = [](Value *MaskVal) {
1196 if (Value *SplattedVal = getSplatValue(MaskVal))
1197 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1198 return ConstValue->isAllOnesValue();
1199 return false;
1200 };
1201 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1202 return false;
1203
1204 // Check to make sure we support scalarization of the intrinsic
1205 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1206 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1207 return false;
1208
1209 // Calculate cost of splatting both operands into vectors and the vector
1210 // intrinsic
1211 VectorType *VecTy = cast<VectorType>(VPI.getType());
1212 SmallVector<int> Mask;
1213 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1214 Mask.resize(FVTy->getNumElements(), 0);
1215 InstructionCost SplatCost =
1216 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
1217 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, Mask,
1218 CostKind);
1219
1220 // Calculate the cost of the VP Intrinsic
1221 SmallVector<Type *> Args;
1222 for (Value *V : VPI.args())
1223 Args.push_back(V->getType());
1224 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1225 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1226 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1227
1228 // Determine scalar opcode
1229 std::optional<unsigned> FunctionalOpcode =
1230 VPI.getFunctionalOpcode();
1231 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1232 if (!FunctionalOpcode) {
1233 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1234 if (!ScalarIntrID)
1235 return false;
1236 }
1237
1238 // Calculate cost of scalarizing
1239 InstructionCost ScalarOpCost = 0;
1240 if (ScalarIntrID) {
1241 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1242 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1243 } else {
1244 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1245 VecTy->getScalarType(), CostKind);
1246 }
1247
1248 // The existing splats may be kept around if other instructions use them.
1249 InstructionCost CostToKeepSplats =
1250 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1251 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1252
1253 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1254 << "\n");
1255 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1256 << ", Cost of scalarizing:" << NewCost << "\n");
1257
1258 // We want to scalarize unless the vector variant actually has lower cost.
1259 if (OldCost < NewCost || !NewCost.isValid())
1260 return false;
1261
1262 // Scalarize the intrinsic
1263 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1264 Value *EVL = VPI.getArgOperand(3);
1265
1266 // If the VP op might introduce UB or poison, we can scalarize it provided
1267 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1268 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1269 // scalarizing it.
1270 bool SafeToSpeculate;
1271 if (ScalarIntrID)
1272 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1273 .hasAttribute(Attribute::AttrKind::Speculatable);
1274 else
1275 SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
1276 *FunctionalOpcode, &VPI, nullptr, SQ.AC, SQ.DT);
1277 if (!SafeToSpeculate &&
1278 !isKnownNonZero(EVL, SimplifyQuery(*DL, SQ.DT, SQ.AC, &VPI)))
1279 return false;
1280
1281 Value *ScalarVal =
1282 ScalarIntrID
1283 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1284 {ScalarOp0, ScalarOp1})
1285 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1286 ScalarOp0, ScalarOp1);
1287
1288 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1289 return true;
1290}
1291
1292/// Match a vector op/compare/intrinsic with at least one
1293/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1294/// by insertelement.
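/// Illustrative sketch (an add of two single-lane inserts into constant
/// vectors; the types, constants and lane are example assumptions):
///   %a = insertelement <4 x i32> zeroinitializer, i32 %x, i64 0
///   %b = insertelement <4 x i32> <i32 7, i32 0, i32 0, i32 0>, i32 %y, i64 0
///   %r = add <4 x i32> %a, %b
/// may become, with the constant lanes folded into a new base vector:
///   %s = add i32 %x, %y
///   %r = insertelement <4 x i32> <i32 7, i32 0, i32 0, i32 0>, i32 %s, i64 0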
1295bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1296 auto *UO = dyn_cast<UnaryOperator>(&I);
1297 auto *BO = dyn_cast<BinaryOperator>(&I);
1298 auto *CI = dyn_cast<CmpInst>(&I);
1299 auto *II = dyn_cast<IntrinsicInst>(&I);
1300 if (!UO && !BO && !CI && !II)
1301 return false;
1302
1303 // TODO: Allow intrinsics with different argument types
1304 if (II) {
1305 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1306 return false;
1307 for (auto [Idx, Arg] : enumerate(II->args()))
1308 if (Arg->getType() != II->getType() &&
1309 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1310 return false;
1311 }
1312
1313 // Do not convert the vector condition of a vector select into a scalar
1314 // condition. That may cause problems for codegen because of differences in
1315 // boolean formats and register-file transfers.
1316 // TODO: Can we account for that in the cost model?
1317 if (CI)
1318 for (User *U : I.users())
1319 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1320 return false;
1321
1322 // Match constant vectors or scalars being inserted into constant vectors:
1323 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1324 SmallVector<Value *> VecCs, ScalarOps;
1325 std::optional<uint64_t> Index;
1326
1327 auto Ops = II ? II->args() : I.operands();
1328 for (auto [OpNum, Op] : enumerate(Ops)) {
1329 Constant *VecC;
1330 Value *V;
1331 uint64_t InsIdx = 0;
1332 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1333 m_ConstantInt(InsIdx)))) {
1334 // Bail if any inserts are out of bounds.
1335 VectorType *OpTy = cast<VectorType>(Op->getType());
1336 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1337 return false;
1338 // All inserts must have the same index.
1339 // TODO: Deal with mismatched index constants and variable indexes?
1340 if (!Index)
1341 Index = InsIdx;
1342 else if (InsIdx != *Index)
1343 return false;
1344 VecCs.push_back(VecC);
1345 ScalarOps.push_back(V);
1346 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1347 OpNum, &TTI)) {
1348 VecCs.push_back(Op.get());
1349 ScalarOps.push_back(Op.get());
1350 } else if (match(Op.get(), m_Constant(VecC))) {
1351 VecCs.push_back(VecC);
1352 ScalarOps.push_back(nullptr);
1353 } else {
1354 return false;
1355 }
1356 }
1357
1358 // Bail if all operands are constant.
1359 if (!Index.has_value())
1360 return false;
1361
1362 VectorType *VecTy = cast<VectorType>(I.getType());
1363 Type *ScalarTy = VecTy->getScalarType();
1364 assert(VecTy->isVectorTy() &&
1365 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1366 ScalarTy->isPointerTy()) &&
1367 "Unexpected types for insert element into binop or cmp");
1368
1369 unsigned Opcode = I.getOpcode();
1370 InstructionCost ScalarOpCost, VectorOpCost;
1371 if (CI) {
1372 CmpInst::Predicate Pred = CI->getPredicate();
1373 ScalarOpCost = TTI.getCmpSelInstrCost(
1374 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1375 VectorOpCost = TTI.getCmpSelInstrCost(
1376 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1377 } else if (UO || BO) {
1378 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1379 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1380 } else {
1381 IntrinsicCostAttributes ScalarICA(
1382 II->getIntrinsicID(), ScalarTy,
1383 SmallVector<Type *>(II->arg_size(), ScalarTy));
1384 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1385 IntrinsicCostAttributes VectorICA(
1386 II->getIntrinsicID(), VecTy,
1387 SmallVector<Type *>(II->arg_size(), VecTy));
1388 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1389 }
1390
1391 // Fold the vector constants in the original vectors into a new base vector to
1392 // get more accurate cost modelling.
1393 Value *NewVecC = nullptr;
1394 if (CI)
1395 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1396 else if (UO)
1397 NewVecC =
1398 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1399 else if (BO)
1400 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1401 else if (II)
1402 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1403
1404 if (!NewVecC)
1405 return false;
1406
1407 // Get cost estimate for the insert element. This cost will factor into
1408 // both sequences.
1409 InstructionCost OldCost = VectorOpCost;
1410 InstructionCost NewCost =
1411 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1412 CostKind, *Index, NewVecC);
1413
1414 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1415 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1416 II->getIntrinsicID(), Idx, &TTI)))
1417 continue;
1418 InstructionCost InsertCost = TTI.getVectorInstrCost(
1419 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1420 OldCost += InsertCost;
1421 NewCost += !Op->hasOneUse() * InsertCost;
1422 }
1423
1424 // We want to scalarize unless the vector variant actually has lower cost.
1425 if (OldCost < NewCost || !NewCost.isValid())
1426 return false;
1427
1428 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1429 // inselt NewVecC, (scalar_op V0, V1), Index
1430 if (CI)
1431 ++NumScalarCmp;
1432 else if (UO || BO)
1433 ++NumScalarOps;
1434 else
1435 ++NumScalarIntrinsic;
1436
1437 // For constant cases, extract the scalar element, this should constant fold.
1438 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1439 if (!Scalar)
1440 ScalarOps[OpIdx] = ConstantExpr::getExtractElement(
1441 cast<Constant>(VecC), Builder.getInt64(*Index));
1442
1443 Value *Scalar;
1444 if (CI)
1445 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1446 else if (UO || BO)
1447 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1448 else
1449 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1450
1451 Scalar->setName(I.getName() + ".scalar");
1452
1453 // All IR flags are safe to back-propagate. There is no potential for extra
1454 // poison to be created by the scalar instruction.
1455 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1456 ScalarInst->copyIRFlags(&I);
1457
1458 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1459 replaceValue(I, *Insert);
1460 return true;
1461}
1462
1463/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1464/// a vector into vector operations followed by extract. Note: The SLP pass
1465/// may miss this pattern because of implementation problems.
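/// Illustrative sketch (icmp + and; the constants, types and lanes are example
/// assumptions, and which lane gets shuffled depends on target extract costs):
///   %e0 = extractelement <4 x i32> %x, i64 0
///   %e1 = extractelement <4 x i32> %x, i64 3
///   %c0 = icmp sgt i32 %e0, 42
///   %c1 = icmp sgt i32 %e1, 7
///   %r  = and i1 %c0, %c1
/// may become one vector compare, a shuffle, a vector and, and one extract:
///   %vc = icmp sgt <4 x i32> %x, <i32 42, i32 poison, i32 poison, i32 7>
///   %sh = shufflevector <4 x i1> %vc, <4 x i1> poison,
///                       <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
///   %vr = and <4 x i1> %vc, %sh
///   %r  = extractelement <4 x i1> %vr, i64 0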
1466bool VectorCombine::foldExtractedCmps(Instruction &I) {
1467 auto *BI = dyn_cast<BinaryOperator>(&I);
1468
1469 // We are looking for a scalar binop of booleans.
1470 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1471 if (!BI || !I.getType()->isIntegerTy(1))
1472 return false;
1473
1474 // The compare predicates should match, and each compare should have a
1475 // constant operand.
1476 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1477 Instruction *I0, *I1;
1478 Constant *C0, *C1;
1479 CmpPredicate P0, P1;
1480 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1481 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1482 return false;
1483
1484 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1485 if (!MatchingPred)
1486 return false;
1487
1488 // The compare operands must be extracts of the same vector with constant
1489 // extract indexes.
1490 Value *X;
1491 uint64_t Index0, Index1;
1492 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1493 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1494 return false;
1495
1496 auto *Ext0 = cast<ExtractElementInst>(I0);
1497 auto *Ext1 = cast<ExtractElementInst>(I1);
1498 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1499 if (!ConvertToShuf)
1500 return false;
1501 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1502 "Unknown ExtractElementInst");
1503
1504 // The original scalar pattern is:
1505 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1506 CmpInst::Predicate Pred = *MatchingPred;
1507 unsigned CmpOpcode =
1508 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1509 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1510 if (!VecTy)
1511 return false;
1512
1513 InstructionCost Ext0Cost =
1514 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1515 InstructionCost Ext1Cost =
1516 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
1517 InstructionCost CmpCost = TTI.getCmpSelInstrCost(
1518 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1519 CostKind);
1520
1521 InstructionCost OldCost =
1522 Ext0Cost + Ext1Cost + CmpCost * 2 +
1523 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1524
1525 // The proposed vector pattern is:
1526 // vcmp = cmp Pred X, VecC
1527 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1528 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1529 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1530 auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecTy));
1531 InstructionCost NewCost = TTI.getCmpSelInstrCost(
1532 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1533 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1534 ShufMask[CheapIndex] = ExpensiveIndex;
1535 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
1536 CmpTy, ShufMask, CostKind);
1537 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1538 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1539 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1540 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1541
1542 // Aggressively form vector ops if the cost is equal because the transform
1543 // may enable further optimization.
1544 // Codegen can reverse this transform (scalarize) if it was not profitable.
1545 if (OldCost < NewCost || !NewCost.isValid())
1546 return false;
1547
1548 // Create a vector constant from the 2 scalar constants.
1549 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1550 PoisonValue::get(VecTy->getElementType()));
1551 CmpC[Index0] = C0;
1552 CmpC[Index1] = C1;
1553 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1554 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1555 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1556 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1557 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1558 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1559 replaceValue(I, *NewExt);
1560 ++NumVecCmpBO;
1561 return true;
1562}
1563
1564/// Try to fold scalar selects that select between extracted elements and zero
1565/// into extracting from a vector select. This is rooted at the bitcast.
1566///
1567/// This pattern arises when a vector is bitcast to a smaller element type,
1568/// elements are extracted, and then conditionally selected with zero:
1569///
1570/// %bc = bitcast <4 x i32> %src to <16 x i8>
1571/// %e0 = extractelement <16 x i8> %bc, i32 0
1572/// %s0 = select i1 %cond, i8 %e0, i8 0
1573/// %e1 = extractelement <16 x i8> %bc, i32 1
1574/// %s1 = select i1 %cond, i8 %e1, i8 0
1575/// ...
1576///
1577/// Transforms to:
1578/// %sel = select i1 %cond, <4 x i32> %src, <4 x i32> zeroinitializer
1579/// %bc = bitcast <4 x i32> %sel to <16 x i8>
1580/// %e0 = extractelement <16 x i8> %bc, i32 0
1581/// %e1 = extractelement <16 x i8> %bc, i32 1
1582/// ...
1583///
1584/// This is profitable because vector select on wider types produces fewer
1585/// select/cndmask instructions than scalar selects on each element.
1586bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
1587 auto *BC = dyn_cast<BitCastInst>(&I);
1588 if (!BC)
1589 return false;
1590
1591 FixedVectorType *SrcVecTy = dyn_cast<FixedVectorType>(BC->getSrcTy());
1592 FixedVectorType *DstVecTy = dyn_cast<FixedVectorType>(BC->getDestTy());
1593 if (!SrcVecTy || !DstVecTy)
1594 return false;
1595
1596 // Source must be 32-bit or 64-bit elements, destination must be smaller
1597 // integer elements. Zero in all these types is all-bits-zero.
1598 Type *SrcEltTy = SrcVecTy->getElementType();
1599 Type *DstEltTy = DstVecTy->getElementType();
1600 unsigned SrcEltBits = SrcEltTy->getPrimitiveSizeInBits();
1601 unsigned DstEltBits = DstEltTy->getPrimitiveSizeInBits();
1602
1603 if (SrcEltBits != 32 && SrcEltBits != 64)
1604 return false;
1605
1606 if (!DstEltTy->isIntegerTy() || DstEltBits >= SrcEltBits)
1607 return false;
1608
1609 // Check profitability using TTI before collecting users.
1610 Type *CondTy = CmpInst::makeCmpResultType(DstEltTy);
1611 Type *VecCondTy = CmpInst::makeCmpResultType(SrcVecTy);
1612
1613 InstructionCost ScalarSelCost =
1614 TTI.getCmpSelInstrCost(Instruction::Select, DstEltTy, CondTy,
1615 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1616 InstructionCost VecSelCost =
1617 TTI.getCmpSelInstrCost(Instruction::Select, SrcVecTy, VecCondTy,
1618 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1619
1620 // We need at least this many selects for vectorization to be profitable.
1621 // VecSelCost < ScalarSelCost * NumSelects => NumSelects > VecSelCost /
1622 // ScalarSelCost
1623 if (!ScalarSelCost.isValid() || ScalarSelCost == 0)
1624 return false;
1625
1626 unsigned MinSelects = (VecSelCost.getValue() / ScalarSelCost.getValue()) + 1;
1627
1628 // Quick check: if bitcast doesn't have enough users, bail early.
1629 if (!BC->hasNUsesOrMore(MinSelects))
1630 return false;
1631
1632 // Collect all select users that match the pattern, grouped by condition.
1633 // Pattern: select i1 %cond, (extractelement %bc, idx), 0
1634 DenseMap<Value *, SmallVector<SelectInst *, 8>> CondToSelects;
1635
1636 for (User *U : BC->users()) {
1637 auto *Ext = dyn_cast<ExtractElementInst>(U);
1638 if (!Ext)
1639 continue;
1640
1641 for (User *ExtUser : Ext->users()) {
1642 Value *Cond;
1643 // Match: select i1 %cond, %ext, 0
1644 if (match(ExtUser, m_Select(m_Value(Cond), m_Specific(Ext), m_Zero())) &&
1645 Cond->getType()->isIntegerTy(1))
1646 CondToSelects[Cond].push_back(cast<SelectInst>(ExtUser));
1647 }
1648 }
1649
1650 if (CondToSelects.empty())
1651 return false;
1652
1653 bool MadeChange = false;
1654 Value *SrcVec = BC->getOperand(0);
1655
1656 // Process each group of selects with the same condition.
1657 for (auto [Cond, Selects] : CondToSelects) {
1658 // Only profitable if vector select cost < total scalar select cost.
1659 if (Selects.size() < MinSelects) {
1660 LLVM_DEBUG(dbgs() << "VectorCombine: foldSelectsFromBitcast not "
1661 << "profitable (VecCost=" << VecSelCost
1662 << ", ScalarCost=" << ScalarSelCost
1663 << ", NumSelects=" << Selects.size() << ")\n");
1664 continue;
1665 }
1666
1667 // Create the vector select and bitcast once for this condition.
1668 auto InsertPt = std::next(BC->getIterator());
1669
1670 if (auto *CondInst = dyn_cast<Instruction>(Cond))
1671 if (DT.dominates(BC, CondInst))
1672 InsertPt = std::next(CondInst->getIterator());
1673
1674 Builder.SetInsertPoint(InsertPt);
1675 Value *VecSel =
1676 Builder.CreateSelect(Cond, SrcVec, Constant::getNullValue(SrcVecTy));
1677 Value *NewBC = Builder.CreateBitCast(VecSel, DstVecTy);
1678
1679 // Replace each scalar select with an extract from the new bitcast.
1680 for (SelectInst *Sel : Selects) {
1681 auto *Ext = cast<ExtractElementInst>(Sel->getTrueValue());
1682 Value *Idx = Ext->getIndexOperand();
1683
1684 Builder.SetInsertPoint(Sel);
1685 Value *NewExt = Builder.CreateExtractElement(NewBC, Idx);
1686 replaceValue(*Sel, *NewExt);
1687 MadeChange = true;
1688 }
1689
1690 LLVM_DEBUG(dbgs() << "VectorCombine: folded " << Selects.size()
1691 << " selects into vector select\n");
1692 }
1693
1694 return MadeChange;
1695}
1696
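/// Helper for foldBinopOfReductions: estimate the cost of the IR feeding a
/// vector reduction intrinsic (CostBeforeReduction) and of the reduction
/// itself (CostAfterReduction), recognising zext/sext-extended and
/// multiply-accumulate style inputs so the extended-reduction and
/// mul-acc-reduction cost hooks can be queried instead of the plain ones.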
1697 static void analyzeCostOfVecReduction(const IntrinsicInst &II,
1698 TTI::TargetCostKind CostKind,
1699 const TargetTransformInfo &TTI,
1700 InstructionCost &CostBeforeReduction,
1701 InstructionCost &CostAfterReduction) {
1702 Instruction *Op0, *Op1;
1703 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1704 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1705 unsigned ReductionOpc =
1706 getArithmeticReductionInstruction(II.getIntrinsicID());
1707 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1708 bool IsUnsigned = isa<ZExtInst>(RedOp);
1709 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1710
1711 CostBeforeReduction =
1712 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1713 TTI::CastContextHint::None, CostKind, RedOp);
1714 CostAfterReduction =
1715 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1716 ExtType, FastMathFlags(), CostKind);
1717 return;
1718 }
1719 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1720 match(RedOp,
1721 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
1722 match(Op0, m_ZExtOrSExt(m_Value())) &&
1723 Op0->getOpcode() == Op1->getOpcode() &&
1724 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1725 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1726 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1727 bool IsUnsigned = isa<ZExtInst>(Op0);
1728 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1729 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1730
1731 InstructionCost ExtCost =
1732 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1733 TTI::CastContextHint::None, CostKind, Op0);
1734 InstructionCost MulCost =
1735 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1736 InstructionCost Ext2Cost =
1737 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1738 TTI::CastContextHint::None, CostKind, RedOp);
1739
1740 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1741 CostAfterReduction = TTI.getMulAccReductionCost(
1742 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1743 return;
1744 }
1745 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1746 std::nullopt, CostKind);
1747}
1748
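/// Try to merge a binary op of two one-use reductions over the same vector
/// type into a single reduction of a vector binary op when the cost model
/// agrees, e.g. (names a/b illustrative):
///   add(reduce.add(a), reduce.add(b)) -> reduce.add(add(a, b))
///   sub(reduce.add(a), reduce.add(b)) -> reduce.add(sub(a, b))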
1749bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1750 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1751 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1752 if (BinOpOpc == Instruction::Sub)
1753 ReductionIID = Intrinsic::vector_reduce_add;
1754 if (ReductionIID == Intrinsic::not_intrinsic)
1755 return false;
1756
1757 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1758 Intrinsic::ID IID) -> Value * {
1759 auto *II = dyn_cast<IntrinsicInst>(V);
1760 if (!II)
1761 return nullptr;
1762 if (II->getIntrinsicID() == IID && II->hasOneUse())
1763 return II->getArgOperand(0);
1764 return nullptr;
1765 };
1766
1767 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1768 if (!V0)
1769 return false;
1770 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1771 if (!V1)
1772 return false;
1773
1774 auto *VTy = cast<VectorType>(V0->getType());
1775 if (V1->getType() != VTy)
1776 return false;
1777 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1778 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1779 unsigned ReductionOpc =
1780 getArithmeticReductionInstruction(II0.getIntrinsicID());
1781
1782 InstructionCost OldCost = 0;
1783 InstructionCost NewCost = 0;
1784 InstructionCost CostOfRedOperand0 = 0;
1785 InstructionCost CostOfRed0 = 0;
1786 InstructionCost CostOfRedOperand1 = 0;
1787 InstructionCost CostOfRed1 = 0;
1788 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1789 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1790 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1791 NewCost =
1792 CostOfRedOperand0 + CostOfRedOperand1 +
1793 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1794 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1795 if (NewCost >= OldCost || !NewCost.isValid())
1796 return false;
1797
1798 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1799 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1800 << "\n");
1801 Value *VectorBO;
1802 if (BinOpOpc == Instruction::Or)
1803 VectorBO = Builder.CreateOr(V0, V1, "",
1804 cast<PossiblyDisjointInst>(I).isDisjoint());
1805 else
1806 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1807
1808 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1809 replaceValue(I, *Rdx);
1810 return true;
1811}
1812
1813// Check if memory loc modified between two instrs in the same BB
1814 static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1815 BasicBlock::iterator End,
1816 const MemoryLocation &Loc, AAResults &AA) {
1817 unsigned NumScanned = 0;
1818 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1819 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1820 ++NumScanned > MaxInstrsToScan;
1821 });
1822}
1823
1824namespace {
1825/// Helper class to indicate whether a vector index can be safely scalarized and
1826/// if a freeze needs to be inserted.
1827class ScalarizationResult {
1828 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1829
1830 StatusTy Status;
1831 Value *ToFreeze;
1832
1833 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1834 : Status(Status), ToFreeze(ToFreeze) {}
1835
1836public:
1837 ScalarizationResult(const ScalarizationResult &Other) = default;
1838 ~ScalarizationResult() {
1839 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1840 }
1841
1842 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1843 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1844 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1845 return {StatusTy::SafeWithFreeze, ToFreeze};
1846 }
1847
1848 /// Returns true if the index can be scalarized without requiring a freeze.
1849 bool isSafe() const { return Status == StatusTy::Safe; }
1850 /// Returns true if the index cannot be scalarized.
1851 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1852 /// Returns true if the index can be scalarized, but requires inserting a
1853 /// freeze.
1854 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1855
1856 /// Reset the state to Unsafe and clear ToFreeze if set.
1857 void discard() {
1858 ToFreeze = nullptr;
1859 Status = StatusTy::Unsafe;
1860 }
1861
1862 /// Freeze the ToFreeze value and update the use in \p UserI to use it.
1863 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1864 assert(isSafeWithFreeze() &&
1865 "should only be used when freezing is required");
1866 assert(is_contained(ToFreeze->users(), &UserI) &&
1867 "UserI must be a user of ToFreeze");
1868 IRBuilder<>::InsertPointGuard Guard(Builder);
1869 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1870 Value *Frozen =
1871 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1872 for (Use &U : make_early_inc_range((UserI.operands())))
1873 if (U.get() == ToFreeze)
1874 U.set(Frozen);
1875
1876 ToFreeze = nullptr;
1877 }
1878};
1879} // namespace
1880
1881/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1882/// Idx. \p Idx must access a valid vector element.
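/// For example (illustrative values): with a <4 x i32> vector, a constant
/// index of 2 is Safe, a constant index of 7 is Unsafe, and a variable index
/// such as "and i64 %i, 3" is provably in range but %i may still be poison,
/// so the result is SafeWithFreeze on %i.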
1883static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1884 const SimplifyQuery &SQ) {
1885 // We do checks for both fixed vector types and scalable vector types.
1886 // This is the number of elements of fixed vector types,
1887 // or the minimum number of elements of scalable vector types.
1888 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1889 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1890
1891 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1892 if (C->getValue().ult(NumElements))
1893 return ScalarizationResult::safe();
1894 return ScalarizationResult::unsafe();
1895 }
1896
1897 // Always unsafe if the index type can't handle all inbound values.
1898 if (!llvm::isUIntN(IntWidth, NumElements))
1899 return ScalarizationResult::unsafe();
1900
1901 APInt Zero(IntWidth, 0);
1902 APInt MaxElts(IntWidth, NumElements);
1903 ConstantRange ValidIndices(Zero, MaxElts);
1904 ConstantRange IdxRange(IntWidth, true);
1905
1906 if (isGuaranteedNotToBePoison(Idx, SQ.AC, SQ.CxtI, SQ.DT)) {
1907 if (ValidIndices.contains(
1908 computeConstantRange(Idx, /*ForSigned=*/false, SQ)))
1909 return ScalarizationResult::safe();
1910 return ScalarizationResult::unsafe();
1911 }
1912
1913 // If the index may be poison, check if we can insert a freeze before the
1914 // range of the index is restricted.
1915 Value *IdxBase;
1916 ConstantInt *CI;
1917 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1918 IdxRange = IdxRange.binaryAnd(CI->getValue());
1919 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1920 IdxRange = IdxRange.urem(CI->getValue());
1921 }
1922
1923 if (ValidIndices.contains(IdxRange))
1924 return ScalarizationResult::safeWithFreeze(IdxBase);
1925 return ScalarizationResult::unsafe();
1926}
1927
1928/// The memory operation on a vector of \p ScalarType had alignment of
1929/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1930/// alignment that will be valid for the memory operation on a single scalar
1931/// element of the same type with index \p Idx.
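/// For example (illustrative): scalarizing element 1 of a <4 x i32> access
/// known to be 16-byte aligned gives a 4-byte offset, so only 4-byte alignment
/// can be assumed for the scalar i32 access; element 2 (offset 8) gets align 8,
/// and a non-constant index is handled conservatively using just the element
/// store size.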
1932 static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1933 Type *ScalarType, Value *Idx,
1934 const DataLayout &DL) {
1935 if (auto *C = dyn_cast<ConstantInt>(Idx))
1936 return commonAlignment(VectorAlignment,
1937 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1938 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1939}
1940
1941// Combine patterns like:
1942// %0 = load <4 x i32>, <4 x i32>* %a
1943// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1944// store <4 x i32> %1, <4 x i32>* %a
1945// to:
1946// %0 = bitcast <4 x i32>* %a to i32*
1947// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1948// store i32 %b, i32* %1
1949bool VectorCombine::foldSingleElementStore(Instruction &I) {
1950 if (!TTI.allowVectorElementIndexingUsingGEP())
1951 return false;
1952 auto *SI = cast<StoreInst>(&I);
1953 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1954 return false;
1955
1956 // TODO: Combine more complicated patterns (multiple insert) by referencing
1957 // TargetTransformInfo.
1958 Instruction *Source;
1959 Value *NewElement;
1960 Value *Idx;
1961 if (!match(SI->getValueOperand(),
1962 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1963 m_Value(Idx))))
1964 return false;
1965
1966 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1967 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1968 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1969 // Don't optimize for atomic/volatile load or store. Ensure memory is not
1970 // modified in between, the vector type matches the store size, and the index is inbounds.
1971 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1972 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1973 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1974 return false;
1975
1976 auto ScalarizableIdx =
1977 canScalarizeAccess(VecTy, Idx, SQ.getWithInstruction(Load));
1978 if (ScalarizableIdx.isUnsafe() ||
1979 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1980 MemoryLocation::get(SI), AA))
1981 return false;
1982
1983 // Ensure we add the load back to the worklist BEFORE its users so they can
1984 // be erased in the correct order.
1985 Worklist.push(Load);
1986
1987 if (ScalarizableIdx.isSafeWithFreeze())
1988 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1989 Value *GEP = Builder.CreateInBoundsGEP(
1990 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1991 {ConstantInt::get(Idx->getType(), 0), Idx});
1992 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1993 NSI->copyMetadata(*SI);
1994 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1995 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1996 *DL);
1997 NSI->setAlignment(ScalarOpAlignment);
1998 replaceValue(I, *NSI);
1999 eraseInstruction(I);
2000 return true;
2001 }
2002
2003 return false;
2004}
2005
2006/// Try to scalarize vector loads feeding extractelement or bitcast
2007/// instructions.
2008bool VectorCombine::scalarizeLoad(Instruction &I) {
2009 Value *Ptr;
2010 if (!match(&I, m_Load(m_Value(Ptr))))
2011 return false;
2012
2013 auto *LI = cast<LoadInst>(&I);
2014 auto *VecTy = cast<VectorType>(LI->getType());
2015 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
2016 return false;
2017
2018 bool AllExtracts = true;
2019 bool AllBitcasts = true;
2020 Instruction *LastCheckedInst = LI;
2021 unsigned NumInstChecked = 0;
2022
2023 // Check what type of users we have (must either all be extracts or
2024 // bitcasts) and ensure no memory modifications between the load and
2025 // its users.
2026 for (User *U : LI->users()) {
2027 auto *UI = dyn_cast<Instruction>(U);
2028 if (!UI || UI->getParent() != LI->getParent())
2029 return false;
2030
2031 // If any user is waiting to be erased, then bail out as this will
2032 // distort the cost calculation and possibly lead to infinite loops.
2033 if (UI->use_empty())
2034 return false;
2035
2036 if (!isa<ExtractElementInst>(UI))
2037 AllExtracts = false;
2038 if (!isa<BitCastInst>(UI))
2039 AllBitcasts = false;
2040
2041 // Check if any instruction between the load and the user may modify memory.
2042 if (LastCheckedInst->comesBefore(UI)) {
2043 for (Instruction &I :
2044 make_range(std::next(LI->getIterator()), UI->getIterator())) {
2045 // Bail out if we reached the check limit or the instruction may write
2046 // to memory.
2047 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
2048 return false;
2049 NumInstChecked++;
2050 }
2051 LastCheckedInst = UI;
2052 }
2053 }
2054
2055 if (AllExtracts)
2056 return scalarizeLoadExtract(LI, VecTy, Ptr);
2057 if (AllBitcasts)
2058 return scalarizeLoadBitcast(LI, VecTy, Ptr);
2059 return false;
2060}
2061
2062/// Try to scalarize vector loads feeding extractelement instructions.
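/// For example (illustrative, and only when the cost model favors it):
///   %v = load <4 x i32>, ptr %p
///   %e = extractelement <4 x i32> %v, i64 2
/// -->
///   %gep = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 2
///   %e = load i32, ptr %gep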
2063bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
2064 Value *Ptr) {
2065 if (!TTI.allowVectorElementIndexingUsingGEP())
2066 return false;
2067
2068 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
2069 llvm::scope_exit FailureGuard([&]() {
2070 // If the transform is aborted, discard the ScalarizationResults.
2071 for (auto &Pair : NeedFreeze)
2072 Pair.second.discard();
2073 });
2074
2075 InstructionCost OriginalCost =
2076 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2077 LI->getPointerAddressSpace(), CostKind);
2078 InstructionCost ScalarizedCost = 0;
2079
2080 for (User *U : LI->users()) {
2081 auto *UI = cast<ExtractElementInst>(U);
2082
2083 auto ScalarIdx = canScalarizeAccess(VecTy, UI->getIndexOperand(),
2084 SQ.getWithInstruction(LI));
2085 if (ScalarIdx.isUnsafe())
2086 return false;
2087 if (ScalarIdx.isSafeWithFreeze()) {
2088 NeedFreeze.try_emplace(UI, ScalarIdx);
2089 ScalarIdx.discard();
2090 }
2091
2092 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
2093 OriginalCost +=
2094 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
2095 Index ? Index->getZExtValue() : -1);
2096 ScalarizedCost +=
2097 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
2098 Align(1), LI->getPointerAddressSpace(), CostKind);
2099 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
2100 nullptr, nullptr, CostKind);
2101 }
2102
2103 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
2104 << "\n LoadExtractCost: " << OriginalCost
2105 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2106
2107 if (ScalarizedCost >= OriginalCost)
2108 return false;
2109
2110 // Ensure we add the load back to the worklist BEFORE its users so they can
2111 // be erased in the correct order.
2112 Worklist.push(LI);
2113
2114 Type *ElemType = VecTy->getElementType();
2115
2116 // Replace extracts with narrow scalar loads.
2117 for (User *U : LI->users()) {
2118 auto *EI = cast<ExtractElementInst>(U);
2119 Value *Idx = EI->getIndexOperand();
2120
2121 // Insert 'freeze' for poison indexes.
2122 auto It = NeedFreeze.find(EI);
2123 if (It != NeedFreeze.end())
2124 It->second.freeze(Builder, *cast<Instruction>(Idx));
2125
2126 Builder.SetInsertPoint(EI);
2127 Value *GEP =
2128 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
2129 auto *NewLoad = cast<LoadInst>(
2130 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
2131
2132 Align ScalarOpAlignment =
2133 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
2134 NewLoad->setAlignment(ScalarOpAlignment);
2135
2136 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
2137 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
2138 AAMDNodes OldAAMD = LI->getAAMetadata();
2139 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
2140 }
2141
2142 replaceValue(*EI, *NewLoad, false);
2143 }
2144
2145 FailureGuard.release();
2146 return true;
2147}
2148
2149/// Try to scalarize vector loads feeding bitcast instructions.
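/// For example (illustrative, and only when the cost model favors it):
///   %v = load <4 x i32>, ptr %p
///   %s = bitcast <4 x i32> %v to i128
/// -->
///   %s = load i128, ptr %p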
2150bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
2151 Value *Ptr) {
2152 InstructionCost OriginalCost =
2153 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2154 LI->getPointerAddressSpace(), CostKind);
2155
2156 Type *TargetScalarType = nullptr;
2157 unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
2158
2159 for (User *U : LI->users()) {
2160 auto *BC = cast<BitCastInst>(U);
2161
2162 Type *DestTy = BC->getDestTy();
2163 if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
2164 return false;
2165
2166 unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
2167 if (DestBitWidth != VecBitWidth)
2168 return false;
2169
2170 // All bitcasts must target the same scalar type.
2171 if (!TargetScalarType)
2172 TargetScalarType = DestTy;
2173 else if (TargetScalarType != DestTy)
2174 return false;
2175
2176 OriginalCost +=
2177 TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
2178 TTI::CastContextHint::None, CostKind, BC);
2179 }
2180
2181 if (!TargetScalarType)
2182 return false;
2183
2184 assert(!LI->user_empty() && "Unexpected load without bitcast users");
2185 InstructionCost ScalarizedCost =
2186 TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
2187 LI->getPointerAddressSpace(), CostKind);
2188
2189 LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2190 << "\n OriginalCost: " << OriginalCost
2191 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2192
2193 if (ScalarizedCost >= OriginalCost)
2194 return false;
2195
2196 // Ensure we add the load back to the worklist BEFORE its users so they can
2197 // be erased in the correct order.
2198 Worklist.push(LI);
2199
2200 Builder.SetInsertPoint(LI);
2201 auto *ScalarLoad =
2202 Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
2203 ScalarLoad->setAlignment(LI->getAlign());
2204 ScalarLoad->copyMetadata(*LI);
2205
2206 // Replace all bitcast users with the scalar load.
2207 for (User *U : LI->users()) {
2208 auto *BC = cast<BitCastInst>(U);
2209 replaceValue(*BC, *ScalarLoad, false);
2210 }
2211
2212 return true;
2213}
2214
2215bool VectorCombine::scalarizeExtExtract(Instruction &I) {
2217 return false;
2218 auto *Ext = dyn_cast<ZExtInst>(&I);
2219 if (!Ext)
2220 return false;
2221
2222 // Try to convert a vector zext feeding only extracts to a set of scalar
2223 // (Src >> (ExtIdx * EltBitWidth)) & LowEltBitsMask
2224 // operations, if profitable.
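  // e.g. (illustrative) for "zext <4 x i8> %src to <4 x i32>" feeding extracts,
  // bitcast %src to i32 and compute each requested lane as
  //   and (lshr i32 %bits, 8 * lane), 255
  // with the shift amount mirrored on big-endian targets.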
2225 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2226 if (!SrcTy)
2227 return false;
2228 auto *DstTy = cast<FixedVectorType>(Ext->getType());
2229
2230 Type *ScalarDstTy = DstTy->getElementType();
2231 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2232 return false;
2233
2234 InstructionCost VectorCost =
2235 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2236 TTI::CastContextHint::None, CostKind, Ext);
2237 unsigned ExtCnt = 0;
2238 bool ExtLane0 = false;
2239 for (User *U : Ext->users()) {
2240 uint64_t Idx;
2241 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2242 return false;
2243 if (cast<Instruction>(U)->use_empty())
2244 continue;
2245 ExtCnt += 1;
2246 ExtLane0 |= !Idx;
2247 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2248 CostKind, Idx, U);
2249 }
2250
2251 InstructionCost ScalarCost =
2252 ExtCnt * TTI.getArithmeticInstrCost(
2253 Instruction::And, ScalarDstTy, CostKind,
2254 {TTI::OK_AnyValue, TTI::OP_None},
2255 {TTI::OK_NonUniformConstantValue, TTI::OP_None}) +
2256 (ExtCnt - ExtLane0) *
2257 TTI.getArithmeticInstrCost(
2258 Instruction::LShr, ScalarDstTy, CostKind,
2259 {TTI::OK_AnyValue, TTI::OP_None},
2260 {TTI::OK_NonUniformConstantValue, TTI::OP_None});
2261 if (ScalarCost > VectorCost)
2262 return false;
2263
2264 Value *ScalarV = Ext->getOperand(0);
2265 if (!isGuaranteedNotToBePoison(ScalarV, SQ.AC, dyn_cast<Instruction>(ScalarV),
2266 SQ.DT)) {
2267 // Check whether all lanes are extracted, all extracts trigger UB
2268 // on poison, and the last extract (and hence all previous ones)
2269 // are guaranteed to execute if Ext executes. If so, we do not
2270 // need to insert a freeze.
2271 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2272 bool AllExtractsTriggerUB = true;
2273 ExtractElementInst *LastExtract = nullptr;
2274 BasicBlock *ExtBB = Ext->getParent();
2275 for (User *U : Ext->users()) {
2276 auto *Extract = cast<ExtractElementInst>(U);
2277 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2278 AllExtractsTriggerUB = false;
2279 break;
2280 }
2281 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2282 if (!LastExtract || LastExtract->comesBefore(Extract))
2283 LastExtract = Extract;
2284 }
2285 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2286 !AllExtractsTriggerUB ||
2287 !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(),
2288 LastExtract->getIterator()))
2289 ScalarV = Builder.CreateFreeze(ScalarV);
2290 }
2291 ScalarV = Builder.CreateBitCast(
2292 ScalarV,
2293 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2294 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2295 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2296 APInt EltBitMask = APInt::getLowBitsSet(TotalBits, SrcEltSizeInBits);
2297 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2298 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2299 for (User *U : Ext->users()) {
2300 auto *Extract = cast<ExtractElementInst>(U);
2301 uint64_t Idx =
2302 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2303 uint64_t ShiftAmt =
2304 DL->isBigEndian()
2305 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2306 : (Idx * SrcEltSizeInBits);
2307 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2308 Value *And = Builder.CreateAnd(LShr, Mask);
2309 U->replaceAllUsesWith(And);
2310 }
2311 return true;
2312}
2313
2314/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2315/// to "(bitcast (concat X, Y))"
2316/// where X/Y are bitcasted from i1 mask vectors.
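/// For example (illustrative, little-endian): with <8 x i1> masks %x and %y,
///   (or disjoint (zext (bitcast %x to i8) to i16),
///                (shl (zext (bitcast %y to i8) to i16), 8))
/// becomes a <16 x i1> concatenation of %x and %y bitcast to i16.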
2317bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2318 Type *Ty = I.getType();
2319 if (!Ty->isIntegerTy())
2320 return false;
2321
2322 // TODO: Add big endian test coverage
2323 if (DL->isBigEndian())
2324 return false;
2325
2326 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2327 Instruction *X, *Y;
2328 if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y))))
2329 return false;
2330
2331 // Allow both sources to contain shl, to handle more generic pattern:
2332 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2333 Value *SrcX;
2334 uint64_t ShAmtX = 0;
2335 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2336 !match(X, m_OneUse(
2337 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
2338 m_ConstantInt(ShAmtX)))))
2339 return false;
2340
2341 Value *SrcY;
2342 uint64_t ShAmtY = 0;
2343 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2344 !match(Y, m_OneUse(
2345 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
2346 m_ConstantInt(ShAmtY)))))
2347 return false;
2348
2349 // Canonicalize larger shift to the RHS.
2350 if (ShAmtX > ShAmtY) {
2351 std::swap(X, Y);
2352 std::swap(SrcX, SrcY);
2353 std::swap(ShAmtX, ShAmtY);
2354 }
2355
2356 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2357 // difference is the mask width so they can be easily concatenated together.
2358 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2359 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2360 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2361 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2362 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2363 !MaskTy->getElementType()->isIntegerTy(1) ||
2364 MaskTy->getNumElements() != ShAmtDiff ||
2365 MaskTy->getNumElements() > (BitWidth / 2))
2366 return false;
2367
2368 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2369 auto *ConcatIntTy =
2370 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2371 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2372
2373 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2374 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2375
2376 // TODO: Is it worth supporting multi use cases?
2377 InstructionCost OldCost = 0;
2378 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2379 OldCost +=
2380 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2381 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2382 TTI::CastContextHint::None, CostKind);
2383 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2384 TTI::CastContextHint::None, CostKind);
2385
2386 InstructionCost NewCost = 0;
2387 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ConcatTy,
2388 MaskTy, ConcatMask, CostKind);
2389 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2390 TTI::CastContextHint::None, CostKind);
2391 if (Ty != ConcatIntTy)
2392 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2393 TTI::CastContextHint::None, CostKind);
2394 if (ShAmtX > 0)
2395 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2396
2397 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2398 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2399 << "\n");
2400
2401 if (NewCost > OldCost)
2402 return false;
2403
2404 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2405 // any residual zero-extension or shifting.
2406 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2407 Worklist.pushValue(Concat);
2408
2409 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2410
2411 if (Ty != ConcatIntTy) {
2412 Worklist.pushValue(Result);
2413 Result = Builder.CreateZExt(Result, Ty);
2414 }
2415
2416 if (ShAmtX > 0) {
2417 Worklist.pushValue(Result);
2418 Result = Builder.CreateShl(Result, ShAmtX);
2419 }
2420
2421 replaceValue(I, *Result);
2422 return true;
2423}
2424
2425/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2426/// --> "binop (shuffle), (shuffle)".
2427bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2428 BinaryOperator *BinOp;
2429 ArrayRef<int> OuterMask;
2430 if (!match(&I, m_Shuffle(m_BinOp(BinOp), m_Undef(), m_Mask(OuterMask))))
2431 return false;
2432
2433 // Don't introduce poison into div/rem.
2434 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2435 return false;
2436
2437 Value *Op00, *Op01, *Op10, *Op11;
2438 ArrayRef<int> Mask0, Mask1;
2439 bool Match0 = match(BinOp->getOperand(0),
2440 m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)));
2441 bool Match1 = match(BinOp->getOperand(1),
2442 m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)));
2443 if (!Match0 && !Match1)
2444 return false;
2445
2446 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2447 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2448 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2449 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2450
2451 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2452 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2453 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2454 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2455 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2456 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2457 return false;
2458
2459 unsigned NumSrcElts = BinOpTy->getNumElements();
2460
2461 // Don't accept shuffles that reference the second operand in
2462 // div/rem or if it's an undef arg.
2463 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2464 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2465 return false;
2466
2467 // Merge outer / inner (or identity if no match) shuffles.
2468 SmallVector<int> NewMask0, NewMask1;
2469 for (int M : OuterMask) {
2470 if (M < 0 || M >= (int)NumSrcElts) {
2471 NewMask0.push_back(PoisonMaskElem);
2472 NewMask1.push_back(PoisonMaskElem);
2473 } else {
2474 NewMask0.push_back(Match0 ? Mask0[M] : M);
2475 NewMask1.push_back(Match1 ? Mask1[M] : M);
2476 }
2477 }
2478
2479 unsigned NumOpElts = Op0Ty->getNumElements();
2480 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2481 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2482 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2483 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2484 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2485 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2486
2487 InstructionCost NewCost = 0;
2488 // Try to merge shuffles across the binop if the new shuffles are not costly.
2489 InstructionCost BinOpCost =
2490 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind);
2491 InstructionCost OldCost =
2492 BinOpCost + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
2493 ShuffleDstTy, BinOpTy, OuterMask, CostKind,
2494 0, nullptr, {BinOp}, &I);
2495 if (!BinOp->hasOneUse())
2496 NewCost += BinOpCost;
2497
2498 if (Match0) {
2499 InstructionCost Shuf0Cost = TTI.getShuffleCost(
2500 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2501 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2502 OldCost += Shuf0Cost;
2503 if (!BinOp->hasOneUse() || !BinOp->getOperand(0)->hasOneUse())
2504 NewCost += Shuf0Cost;
2505 }
2506 if (Match1) {
2507 InstructionCost Shuf1Cost = TTI.getShuffleCost(
2508 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2509 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2510 OldCost += Shuf1Cost;
2511 if (!BinOp->hasOneUse() || !BinOp->getOperand(1)->hasOneUse())
2512 NewCost += Shuf1Cost;
2513 }
2514
2515 NewCost += TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2516
2517 if (!IsIdentity0)
2518 NewCost +=
2519 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2520 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2521 if (!IsIdentity1)
2522 NewCost +=
2523 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2524 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2525
2526 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2527 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2528 << "\n");
2529
2530 // If costs are equal, still fold as we reduce instruction count.
2531 if (NewCost > OldCost)
2532 return false;
2533
2534 Value *LHS =
2535 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2536 Value *RHS =
2537 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2538 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2539
2540 // Intersect flags from the old binops.
2541 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2542 NewInst->copyIRFlags(BinOp);
2543
2544 Worklist.pushValue(LHS);
2545 Worklist.pushValue(RHS);
2546 replaceValue(I, *NewBO);
2547 return true;
2548}
2549
2550/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2551/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
2552bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2553 ArrayRef<int> OldMask;
2554 Instruction *LHS, *RHS;
2555 if (!match(&I, m_Shuffle(m_Instruction(LHS), m_Instruction(RHS),
2556 m_Mask(OldMask))))
2557 return false;
2558
2559 // TODO: Add support for addlike etc.
2560 if (LHS->getOpcode() != RHS->getOpcode())
2561 return false;
2562
2563 Value *X, *Y, *Z, *W;
2564 bool IsCommutative = false;
2565 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2566 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2567 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2568 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2569 auto *BO = cast<BinaryOperator>(LHS);
2570 // Don't introduce poison into div/rem.
2571 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2572 return false;
2573 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2574 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2575 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2576 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2577 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2578 } else
2579 return false;
2580
2581 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2582 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2583 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2584 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2585 return false;
2586
2587 bool SameBinOp = LHS == RHS;
2588 unsigned NumSrcElts = BinOpTy->getNumElements();
2589
2590 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2591 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2592 std::swap(X, Y);
2593
2594 auto ConvertToUnary = [NumSrcElts](int &M) {
2595 if (M >= (int)NumSrcElts)
2596 M -= NumSrcElts;
2597 };
2598
2599 SmallVector<int> NewMask0(OldMask);
2600 TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
2601 TTI::OperandValueInfo Op0Info = TTI.commonOperandInfo(X, Z);
2602 if (X == Z) {
2603 llvm::for_each(NewMask0, ConvertToUnary);
2604 SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
2605 Z = PoisonValue::get(BinOpTy);
2606 }
2607
2608 SmallVector<int> NewMask1(OldMask);
2609 TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
2610 TTI::OperandValueInfo Op1Info = TTI.commonOperandInfo(Y, W);
2611 if (Y == W) {
2612 llvm::for_each(NewMask1, ConvertToUnary);
2613 SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
2614 W = PoisonValue::get(BinOpTy);
2615 }
2616
2617 // Try to replace a binop with a shuffle if the shuffle is not costly.
2618 // When SameBinOp, only count the binop cost once.
2619 InstructionCost LHSCost = TTI.getInstructionCost(LHS, CostKind);
2620 InstructionCost RHSCost = TTI.getInstructionCost(RHS, CostKind);
2621
2622 InstructionCost OldCost = LHSCost;
2623 if (!SameBinOp) {
2624 OldCost += RHSCost;
2625 }
2626 OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
2627 ShuffleDstTy, BinResTy, OldMask, CostKind, 0,
2628 nullptr, {LHS, RHS}, &I);
2629
2630 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2631 // where one use shuffles have gotten split across the binop/cmp. These
2632 // often allow a major reduction in total cost that wouldn't happen as
2633 // individual folds.
2634 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2635 TTI::TargetCostKind CostKind) -> bool {
2636 Value *InnerOp;
2637 ArrayRef<int> InnerMask;
2638 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2639 m_Mask(InnerMask)))) &&
2640 InnerOp->getType() == Op->getType() &&
2641 all_of(InnerMask,
2642 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2643 for (int &M : Mask)
2644 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2645 M = InnerMask[M - Offset];
2646 M = 0 <= M ? M + Offset : M;
2647 }
2648 OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind);
2649 Op = InnerOp;
2650 return true;
2651 }
2652 return false;
2653 };
2654 bool ReducedInstCount = false;
2655 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2656 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2657 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2658 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2659 bool SingleSrcBinOp = (X == Y) && (Z == W) && (NewMask0 == NewMask1);
2660 // SingleSrcBinOp only reduces instruction count if we also eliminate the
2661 // original binop(s). If binops have multiple uses, they won't be eliminated.
2662 ReducedInstCount |= SingleSrcBinOp && LHS->hasOneUser() && RHS->hasOneUser();
2663
2664 auto *ShuffleCmpTy =
2665 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2666 InstructionCost NewCost = TTI.getShuffleCost(
2667 SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z});
2668 if (!SingleSrcBinOp)
2669 NewCost += TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1,
2670 CostKind, 0, nullptr, {Y, W});
2671
2672 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2673 NewCost += TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy,
2674 CostKind, Op0Info, Op1Info);
2675 } else {
2676 NewCost +=
2677 TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy, ShuffleDstTy,
2678 PredLHS, CostKind, Op0Info, Op1Info);
2679 }
2680 // If LHS/RHS have other uses, we need to account for the cost of keeping
2681 // the original instructions. When SameBinOp, only add the cost once.
2682 if (!LHS->hasOneUser())
2683 NewCost += LHSCost;
2684 if (!SameBinOp && !RHS->hasOneUser())
2685 NewCost += RHSCost;
2686
2687 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2688 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2689 << "\n");
2690
2691 // If either shuffle will constant fold away, then fold for the same cost as
2692 // we will reduce the instruction count.
2693 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2694 (isa<Constant>(Y) && isa<Constant>(W));
2695 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2696 return false;
2697
2698 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2699 Value *Shuf1 =
2700 SingleSrcBinOp ? Shuf0 : Builder.CreateShuffleVector(Y, W, NewMask1);
2701 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2702 ? Builder.CreateBinOp(
2703 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2704 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2705
2706 // Intersect flags from the old binops.
2707 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2708 NewInst->copyIRFlags(LHS);
2709 NewInst->andIRFlags(RHS);
2710 }
2711
2712 Worklist.pushValue(Shuf0);
2713 Worklist.pushValue(Shuf1);
2714 replaceValue(I, *NewBO);
2715 return true;
2716}
2717
2718/// Try to convert,
2719/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2720/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
2721bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2722 ArrayRef<int> Mask;
2723 Value *C1, *T1, *F1, *C2, *T2, *F2;
2724 if (!match(&I, m_Shuffle(m_Select(m_Value(C1), m_Value(T1), m_Value(F1)),
2725 m_Select(m_Value(C2), m_Value(T2), m_Value(F2)),
2726 m_Mask(Mask))))
2727 return false;
2728
2729 auto *Sel1 = cast<Instruction>(I.getOperand(0));
2730 auto *Sel2 = cast<Instruction>(I.getOperand(1));
2731
2732 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2733 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2734 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2735 return false;
2736
2737 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2738 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2739 // SelectInsts must have the same FMF.
2740 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2741 ((SI0FOp != nullptr) &&
2742 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2743 return false;
2744
2745 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2746 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2747 auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
2748 auto SelOp = Instruction::Select;
2749
2750 InstructionCost CostSel1 = TTI.getCmpSelInstrCost(
2751 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2752 InstructionCost CostSel2 = TTI.getCmpSelInstrCost(
2753 SelOp, SrcVecTy, C2VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2754
2755 InstructionCost OldCost =
2756 CostSel1 + CostSel2 +
2757 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2758 {I.getOperand(0), I.getOperand(1)}, &I);
2759
2760 InstructionCost NewCost = TTI.getShuffleCost(
2761 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2762 Mask, CostKind, 0, nullptr, {C1, C2});
2763 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2764 nullptr, {T1, T2});
2765 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2766 nullptr, {F1, F2});
2767 auto *C1C2ShuffledVecTy = FixedVectorType::get(
2768 Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements());
2769 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2770 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2771
2772 if (!Sel1->hasOneUse())
2773 NewCost += CostSel1;
2774 if (!Sel2->hasOneUse())
2775 NewCost += CostSel2;
2776
2777 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2778 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2779 << "\n");
2780 if (NewCost > OldCost)
2781 return false;
2782
2783 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2784 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2785 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2786 Value *NewSel;
2787 // We presuppose that the SelectInsts have the same FMF.
2788 if (SI0FOp)
2789 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2790 SI0FOp->getFastMathFlags());
2791 else
2792 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2793
2794 Worklist.pushValue(ShuffleCmp);
2795 Worklist.pushValue(ShuffleTrue);
2796 Worklist.pushValue(ShuffleFalse);
2797 replaceValue(I, *NewSel);
2798 return true;
2799}
2800
2801/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2802/// into "castop (shuffle)".
2803bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2804 Value *V0, *V1;
2805 ArrayRef<int> OldMask;
2806 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2807 return false;
2808
2809 // Check whether this is a binary shuffle.
2810 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2811
2812 auto *C0 = dyn_cast<CastInst>(V0);
2813 auto *C1 = dyn_cast<CastInst>(V1);
2814 if (!C0 || (IsBinaryShuffle && !C1))
2815 return false;
2816
2817 Instruction::CastOps Opcode = C0->getOpcode();
2818
2819 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2820 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2821 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2822 return false;
2823
2824 if (IsBinaryShuffle) {
2825 if (C0->getSrcTy() != C1->getSrcTy())
2826 return false;
2827 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2828 if (Opcode != C1->getOpcode()) {
2829 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2830 Opcode = Instruction::SExt;
2831 else
2832 return false;
2833 }
2834 }
2835
2836 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2837 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2838 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2839 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2840 return false;
2841
2842 unsigned NumSrcElts = CastSrcTy->getNumElements();
2843 unsigned NumDstElts = CastDstTy->getNumElements();
2844 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2845 "Only bitcasts expected to alter src/dst element counts");
2846
2847 // Check for bitcasting of unscalable vector types.
2848 // e.g. <32 x i40> -> <40 x i32>
2849 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2850 (NumDstElts % NumSrcElts) != 0)
2851 return false;
2852
2853 SmallVector<int, 16> NewMask;
2854 if (NumSrcElts >= NumDstElts) {
2855 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2856 // always be expanded to the equivalent form choosing narrower elements.
2857 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2858 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2859 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2860 } else {
2861 // The bitcast is from narrow elements to wide elements. The shuffle mask
2862 // must choose consecutive elements to allow casting first.
2863 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2864 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2865 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2866 return false;
2867 }
2868
2869 auto *NewShuffleDstTy =
2870 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2871
2872 // Try to replace a castop with a shuffle if the shuffle is not costly.
2873 InstructionCost CostC0 =
2874 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2875 TTI::CastContextHint::None, CostKind);
2876
2877 TargetTransformInfo::ShuffleKind ShuffleKind;
2878 if (IsBinaryShuffle)
2879 ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc;
2880 else
2881 ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc;
2882
2883 InstructionCost OldCost = CostC0;
2884 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2885 CostKind, 0, nullptr, {}, &I);
2886
2887 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2888 CastSrcTy, NewMask, CostKind);
2889 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
2890 TTI::CastContextHint::None, CostKind);
2891 if (!C0->hasOneUse())
2892 NewCost += CostC0;
2893 if (IsBinaryShuffle) {
2894 InstructionCost CostC1 =
2895 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2896 TTI::CastContextHint::None, CostKind);
2897 OldCost += CostC1;
2898 if (!C1->hasOneUse())
2899 NewCost += CostC1;
2900 }
2901
2902 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2903 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2904 << "\n");
2905 if (NewCost > OldCost)
2906 return false;
2907
2908 Value *Shuf;
2909 if (IsBinaryShuffle)
2910 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2911 NewMask);
2912 else
2913 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2914
2915 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2916
2917 // Intersect flags from the old casts.
2918 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2919 NewInst->copyIRFlags(C0);
2920 if (IsBinaryShuffle)
2921 NewInst->andIRFlags(C1);
2922 }
2923
2924 Worklist.pushValue(Shuf);
2925 replaceValue(I, *Cast);
2926 return true;
2927}
2928
2929/// Try to convert any of:
2930/// "shuffle (shuffle x, y), (shuffle y, x)"
2931/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2932/// "shuffle (shuffle x, undef), y"
2933/// "shuffle x, (shuffle y, undef)"
2934/// into "shuffle x, y".
2935bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2936 ArrayRef<int> OuterMask;
2937 Value *OuterV0, *OuterV1;
2938 if (!match(&I,
2939 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2940 return false;
2941
2942 ArrayRef<int> InnerMask0, InnerMask1;
2943 Value *X0, *X1, *Y0, *Y1;
2944 bool Match0 =
2945 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2946 bool Match1 =
2947 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2948 if (!Match0 && !Match1)
2949 return false;
2950
2951 // If the outer shuffle is a permute, then create a fake inner all-poison
2952 // shuffle. This is easier than accounting for length-changing shuffles below.
2953 SmallVector<int, 16> PoisonMask1;
2954 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2955 X1 = X0;
2956 Y1 = Y0;
2957 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2958 InnerMask1 = PoisonMask1;
2959 Match1 = true; // fake match
2960 }
2961
2962 X0 = Match0 ? X0 : OuterV0;
2963 Y0 = Match0 ? Y0 : OuterV0;
2964 X1 = Match1 ? X1 : OuterV1;
2965 Y1 = Match1 ? Y1 : OuterV1;
2966 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2967 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2968 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2969 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2970 X0->getType() != X1->getType())
2971 return false;
2972
2973 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2974 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2975
2976 // Attempt to merge shuffles, matching up to 2 source operands.
2977 // Replace an index into a poison arg with PoisonMaskElem.
2978 // Bail if either inner mask references an undef arg.
2979 SmallVector<int, 16> NewMask(OuterMask);
2980 Value *NewX = nullptr, *NewY = nullptr;
2981 for (int &M : NewMask) {
2982 Value *Src = nullptr;
2983 if (0 <= M && M < (int)NumImmElts) {
2984 Src = OuterV0;
2985 if (Match0) {
2986 M = InnerMask0[M];
2987 Src = M >= (int)NumSrcElts ? Y0 : X0;
2988 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2989 }
2990 } else if (M >= (int)NumImmElts) {
2991 Src = OuterV1;
2992 M -= NumImmElts;
2993 if (Match1) {
2994 M = InnerMask1[M];
2995 Src = M >= (int)NumSrcElts ? Y1 : X1;
2996 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2997 }
2998 }
2999 if (Src && M != PoisonMaskElem) {
3000 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
3001 if (isa<UndefValue>(Src)) {
3002 // We've referenced an undef element - if it's poison, update the shuffle
3003 // mask, else bail.
3004 if (!isa<PoisonValue>(Src))
3005 return false;
3006 M = PoisonMaskElem;
3007 continue;
3008 }
3009 if (!NewX || NewX == Src) {
3010 NewX = Src;
3011 continue;
3012 }
3013 if (!NewY || NewY == Src) {
3014 M += NumSrcElts;
3015 NewY = Src;
3016 continue;
3017 }
3018 return false;
3019 }
3020 }
3021
3022 if (!NewX)
3023 return PoisonValue::get(ShuffleDstTy);
3024 if (!NewY)
3025 NewY = PoisonValue::get(ShuffleSrcTy);
3026
3027 // Have we folded to an Identity shuffle?
3028 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
3029 replaceValue(I, *NewX);
3030 return true;
3031 }
3032
3033 // Try to merge the shuffles if the new shuffle is not costly.
3034 InstructionCost InnerCost0 = 0;
3035 if (Match0)
3036 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
3037
3038 InstructionCost InnerCost1 = 0;
3039 if (Match1)
3040 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
3041
 3042 InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind);
 3043
3044 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
3045
3046 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
 3047 TargetTransformInfo::ShuffleKind SK =
 3048 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
 3049 : TargetTransformInfo::SK_PermuteTwoSrc;
 3050 InstructionCost NewCost =
3051 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
3052 nullptr, {NewX, NewY});
3053 if (!OuterV0->hasOneUse())
3054 NewCost += InnerCost0;
3055 if (!OuterV1->hasOneUse())
3056 NewCost += InnerCost1;
3057
3058 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
3059 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3060 << "\n");
3061 if (NewCost > OldCost)
3062 return false;
3063
3064 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
3065 replaceValue(I, *Shuf);
3066 return true;
3067}
3068
3069/// Try to convert a chain of length-preserving shuffles that are fed by
3070/// length-changing shuffles from the same source, e.g. a chain of length 3:
3071///
3072/// "shuffle (shuffle (shuffle x, (shuffle y, undef)),
3073/// (shuffle y, undef)),
 3074/// (shuffle y, undef)"
3075///
3076/// into a single shuffle fed by a length-changing shuffle:
3077///
3078/// "shuffle x, (shuffle y, undef)"
3079///
3080/// Such chains arise e.g. from folding extract/insert sequences.
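///
/// A hypothetical IR sketch of a single chain link (widening a <2 x float>
/// leaf %y into a <4 x float> trunk %t):
///   %leaf = shufflevector <2 x float> %y, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
///   %t1   = shufflevector <4 x float> %t, <4 x float> %leaf, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
/// Repeated links of this shape can be merged into one leaf shuffle of y plus
/// one trunk shuffle.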
3081bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
3082 FixedVectorType *TrunkType = dyn_cast<FixedVectorType>(I.getType());
3083 if (!TrunkType)
3084 return false;
3085
3086 unsigned ChainLength = 0;
3087 SmallVector<int> Mask;
3088 SmallVector<int> YMask;
3089 InstructionCost OldCost = 0;
3090 InstructionCost NewCost = 0;
3091 Value *Trunk = &I;
3092 unsigned NumTrunkElts = TrunkType->getNumElements();
3093 Value *Y = nullptr;
3094
3095 for (;;) {
3096 // Match the current trunk against (commutations of) the pattern
3097 // "shuffle trunk', (shuffle y, undef)"
3098 ArrayRef<int> OuterMask;
3099 Value *OuterV0, *OuterV1;
3100 if (ChainLength != 0 && !Trunk->hasOneUse())
3101 break;
3102 if (!match(Trunk, m_Shuffle(m_Value(OuterV0), m_Value(OuterV1),
3103 m_Mask(OuterMask))))
3104 break;
3105 if (OuterV0->getType() != TrunkType) {
3106 // This shuffle is not length-preserving, so it cannot be part of the
3107 // chain.
3108 break;
3109 }
3110
3111 ArrayRef<int> InnerMask0, InnerMask1;
3112 Value *A0, *A1, *B0, *B1;
3113 bool Match0 =
3114 match(OuterV0, m_Shuffle(m_Value(A0), m_Value(B0), m_Mask(InnerMask0)));
3115 bool Match1 =
3116 match(OuterV1, m_Shuffle(m_Value(A1), m_Value(B1), m_Mask(InnerMask1)));
3117 bool Match0Leaf = Match0 && A0->getType() != I.getType();
3118 bool Match1Leaf = Match1 && A1->getType() != I.getType();
3119 if (Match0Leaf == Match1Leaf) {
3120 // Only handle the case of exactly one leaf in each step. The "two leaves"
3121 // case is handled by foldShuffleOfShuffles.
3122 break;
3123 }
3124
3125 SmallVector<int> CommutedOuterMask;
3126 if (Match0Leaf) {
3127 std::swap(OuterV0, OuterV1);
3128 std::swap(InnerMask0, InnerMask1);
3129 std::swap(A0, A1);
3130 std::swap(B0, B1);
3131 llvm::append_range(CommutedOuterMask, OuterMask);
3132 for (int &M : CommutedOuterMask) {
3133 if (M == PoisonMaskElem)
3134 continue;
3135 if (M < (int)NumTrunkElts)
3136 M += NumTrunkElts;
3137 else
3138 M -= NumTrunkElts;
3139 }
3140 OuterMask = CommutedOuterMask;
3141 }
3142 if (!OuterV1->hasOneUse())
3143 break;
3144
3145 if (!isa<UndefValue>(A1)) {
3146 if (!Y)
3147 Y = A1;
3148 else if (Y != A1)
3149 break;
3150 }
3151 if (!isa<UndefValue>(B1)) {
3152 if (!Y)
3153 Y = B1;
3154 else if (Y != B1)
3155 break;
3156 }
3157
3158 auto *YType = cast<FixedVectorType>(A1->getType());
3159 int NumLeafElts = YType->getNumElements();
3160 SmallVector<int> LocalYMask(InnerMask1);
3161 for (int &M : LocalYMask) {
3162 if (M >= NumLeafElts)
3163 M -= NumLeafElts;
3164 }
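 // LocalYMask now indexes lanes of the single leaf source Y directly,
 // regardless of which operand of the leaf shuffle originally supplied them.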
3165
3166 InstructionCost LocalOldCost =
3169
3170 // Handle the initial (start of chain) case.
3171 if (!ChainLength) {
3172 Mask.assign(OuterMask);
3173 YMask.assign(LocalYMask);
3174 OldCost = NewCost = LocalOldCost;
3175 Trunk = OuterV0;
3176 ChainLength++;
3177 continue;
3178 }
3179
3180 // For the non-root case, first attempt to combine masks.
3181 SmallVector<int> NewYMask(YMask);
3182 bool Valid = true;
3183 for (auto [CombinedM, LeafM] : llvm::zip(NewYMask, LocalYMask)) {
3184 if (LeafM == -1 || CombinedM == LeafM)
3185 continue;
3186 if (CombinedM == -1) {
3187 CombinedM = LeafM;
3188 } else {
3189 Valid = false;
3190 break;
3191 }
3192 }
3193 if (!Valid)
3194 break;
3195
3196 SmallVector<int> NewMask;
3197 NewMask.reserve(NumTrunkElts);
3198 for (int M : Mask) {
3199 if (M < 0 || M >= static_cast<int>(NumTrunkElts))
3200 NewMask.push_back(M);
3201 else
3202 NewMask.push_back(OuterMask[M]);
3203 }
3204
3205 // Break the chain if adding this new step complicates the shuffles such
3206 // that it would increase the new cost by more than the old cost of this
3207 // step.
3208 InstructionCost LocalNewCost =
3210 YType, NewYMask, CostKind) +
3212 TrunkType, NewMask, CostKind);
3213
3214 if (LocalNewCost >= NewCost && LocalOldCost < LocalNewCost - NewCost)
3215 break;
3216
3217 LLVM_DEBUG({
3218 if (ChainLength == 1) {
3219 dbgs() << "Found chain of shuffles fed by length-changing shuffles: "
3220 << I << '\n';
3221 }
3222 dbgs() << " next chain link: " << *Trunk << '\n'
3223 << " old cost: " << (OldCost + LocalOldCost)
3224 << " new cost: " << LocalNewCost << '\n';
3225 });
3226
3227 Mask = NewMask;
3228 YMask = NewYMask;
3229 OldCost += LocalOldCost;
3230 NewCost = LocalNewCost;
3231 Trunk = OuterV0;
3232 ChainLength++;
3233 }
3234 if (ChainLength <= 1)
3235 return false;
3236
3237 if (llvm::all_of(Mask, [&](int M) {
3238 return M < 0 || M >= static_cast<int>(NumTrunkElts);
3239 })) {
3240 // Produce a canonical simplified form if all elements are sourced from Y.
3241 for (int &M : Mask) {
3242 if (M >= static_cast<int>(NumTrunkElts))
3243 M = YMask[M - NumTrunkElts];
3244 }
3245 Value *Root =
3246 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), Mask);
3247 replaceValue(I, *Root);
3248 return true;
3249 }
3250
3251 Value *Leaf =
3252 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), YMask);
3253 Value *Root = Builder.CreateShuffleVector(Trunk, Leaf, Mask);
3254 replaceValue(I, *Root);
3255 return true;
3256}
3257
3258/// Try to convert
3259/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
3260bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
3261 Value *V0, *V1;
3262 ArrayRef<int> OldMask;
3263 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
3264 return false;
3265
3266 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3267 auto *II1 = dyn_cast<IntrinsicInst>(V1);
3268 if (!II0 || !II1)
3269 return false;
3270
3271 Intrinsic::ID IID = II0->getIntrinsicID();
3272 if (IID != II1->getIntrinsicID())
3273 return false;
3274 InstructionCost CostII0 =
3275 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3276 InstructionCost CostII1 =
3277 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind);
3278
3279 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3280 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
3281 if (!ShuffleDstTy || !II0Ty)
3282 return false;
3283
3284 if (!isTriviallyVectorizable(IID))
3285 return false;
3286
3287 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
 3288 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) &&
 3289 II0->getArgOperand(I) != II1->getArgOperand(I))
3290 return false;
3291
3292 InstructionCost OldCost =
3293 CostII0 + CostII1 +
 3294 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
 3295 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
3296
3297 SmallVector<Type *> NewArgsTy;
3298 InstructionCost NewCost = 0;
3299 SmallDenseSet<std::pair<Value *, Value *>> SeenOperandPairs;
3300 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
 3301 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
 3302 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3303 } else {
3304 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3305 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3306 ShuffleDstTy->getNumElements());
3307 NewArgsTy.push_back(ArgTy);
3308 std::pair<Value *, Value *> OperandPair =
3309 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3310 if (!SeenOperandPairs.insert(OperandPair).second) {
3311 // We've already computed the cost for this operand pair.
3312 continue;
3313 }
3314 NewCost += TTI.getShuffleCost(
3315 TargetTransformInfo::SK_PermuteTwoSrc, ArgTy, VecTy, OldMask,
3316 CostKind, 0, nullptr, {II0->getArgOperand(I), II1->getArgOperand(I)});
3317 }
3318 }
3319 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3320
3321 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3322 if (!II0->hasOneUse())
3323 NewCost += CostII0;
3324 if (II1 != II0 && !II1->hasOneUse())
3325 NewCost += CostII1;
3326
3327 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
3328 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3329 << "\n");
3330
3331 if (NewCost > OldCost)
3332 return false;
3333
3334 SmallVector<Value *> NewArgs;
3335 SmallDenseMap<std::pair<Value *, Value *>, Value *> ShuffleCache;
3336 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
 3337 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
 3338 NewArgs.push_back(II0->getArgOperand(I));
3339 } else {
3340 std::pair<Value *, Value *> OperandPair =
3341 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3342 auto It = ShuffleCache.find(OperandPair);
3343 if (It != ShuffleCache.end()) {
3344 // Reuse previously created shuffle for this operand pair.
3345 NewArgs.push_back(It->second);
3346 continue;
3347 }
3348 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
3349 II1->getArgOperand(I), OldMask);
3350 ShuffleCache[OperandPair] = Shuf;
3351 NewArgs.push_back(Shuf);
3352 Worklist.pushValue(Shuf);
3353 }
3354 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3355
3356 // Intersect flags from the old intrinsics.
3357 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
3358 NewInst->copyIRFlags(II0);
3359 NewInst->andIRFlags(II1);
3360 }
3361
3362 replaceValue(I, *NewIntrinsic);
3363 return true;
3364}
3365
3366/// Try to convert
3367/// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
3368bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
3369 Value *V0;
3370 ArrayRef<int> Mask;
3371 if (!match(&I, m_Shuffle(m_Value(V0), m_Undef(), m_Mask(Mask))))
3372 return false;
3373
3374 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3375 if (!II0)
3376 return false;
3377
3378 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3379 auto *IntrinsicSrcTy = dyn_cast<FixedVectorType>(II0->getType());
3380 if (!ShuffleDstTy || !IntrinsicSrcTy)
3381 return false;
3382
 3383 // Validate it's a pure permute - the mask should only reference the first vector.
3384 unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
3385 if (any_of(Mask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
3386 return false;
3387
3388 Intrinsic::ID IID = II0->getIntrinsicID();
3389 if (!isTriviallyVectorizable(IID))
3390 return false;
3391
3392 // Cost analysis
 3393 InstructionCost IntrinsicCost =
 3394 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3395 InstructionCost OldCost =
 3396 IntrinsicCost +
 3397 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
 3398 IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
3399
3400 SmallVector<Type *> NewArgsTy;
3401 InstructionCost NewCost = 0;
3402 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
 3403 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
 3404 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3405 } else {
3406 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3407 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3408 ShuffleDstTy->getNumElements());
3409 NewArgsTy.push_back(ArgTy);
 3410 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
 3411 ArgTy, VecTy, Mask, CostKind, 0, nullptr,
3412 {II0->getArgOperand(I)});
3413 }
3414 }
3415 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3416 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3417
3418 // If the intrinsic has multiple uses, we need to account for the cost of
3419 // keeping the original intrinsic around.
3420 if (!II0->hasOneUse())
3421 NewCost += IntrinsicCost;
3422
3423 LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n OldCost: "
3424 << OldCost << " vs NewCost: " << NewCost << "\n");
3425
3426 if (NewCost > OldCost)
3427 return false;
3428
3429 // Transform
3430 SmallVector<Value *> NewArgs;
3431 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
 3432 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
 3433 NewArgs.push_back(II0->getArgOperand(I));
3434 } else {
3435 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), Mask);
3436 NewArgs.push_back(Shuf);
3437 Worklist.pushValue(Shuf);
3438 }
3439 }
3440
3441 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3442
3443 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic))
3444 NewInst->copyIRFlags(II0);
3445
3446 replaceValue(I, *NewIntrinsic);
3447 return true;
3448}
3449
3450using InstLane = std::pair<Value *, int>;
3451
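/// Walk up through a chain of shufflevectors to find the source value and
/// lane that ultimately feed lane Lane of V. Returns
/// {nullptr, PoisonMaskElem} if the lane traces to a poison/undef mask
/// element.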
3452static InstLane lookThroughShuffles(Value *V, int Lane) {
3453 while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
3454 unsigned NumElts =
3455 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
3456 int M = SV->getMaskValue(Lane);
3457 if (M < 0)
3458 return {nullptr, PoisonMaskElem};
3459 if (static_cast<unsigned>(M) < NumElts) {
3460 V = SV->getOperand(0);
3461 Lane = M;
3462 } else {
3463 V = SV->getOperand(1);
3464 Lane = M - NumElts;
3465 }
3466 }
3467 return InstLane{V, Lane};
3468}
3469
3473 for (InstLane IL : Item) {
3474 auto [U, Lane] = IL;
3475 InstLane OpLane =
3476 U ? lookThroughShuffles(cast<Instruction>(U)->getOperand(Op), Lane)
3477 : InstLane{nullptr, PoisonMaskElem};
3478 NItem.emplace_back(OpLane);
3479 }
3480 return NItem;
3481}
3482
3483/// Detect concat of multiple values into a vector
 3484 static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
 3485 const TargetTransformInfo &TTI) {
3486 auto *Ty = cast<FixedVectorType>(Item.front().first->getType());
3487 unsigned NumElts = Ty->getNumElements();
3488 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
3489 return false;
3490
3491 // Check that the concat is free, usually meaning that the type will be split
3492 // during legalization.
3493 SmallVector<int, 16> ConcatMask(NumElts * 2);
3494 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
3495 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
3496 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
3497 Ty, ConcatMask, CostKind) != 0)
3498 return false;
3499
3500 unsigned NumSlices = Item.size() / NumElts;
3501 // Currently we generate a tree of shuffles for the concats, which limits us
3502 // to a power2.
3503 if (!isPowerOf2_32(NumSlices))
3504 return false;
3505 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
3506 Value *SliceV = Item[Slice * NumElts].first;
3507 if (!SliceV || SliceV->getType() != Ty)
3508 return false;
3509 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
3510 auto [V, Lane] = Item[Slice * NumElts + Elt];
3511 if (Lane != static_cast<int>(Elt) || SliceV != V)
3512 return false;
3513 }
3514 }
3515 return true;
3516}
3517
3518static Value *
 3519 generateNewInstTree(ArrayRef<InstLane> Item, Use *From, FixedVectorType *Ty,
 3520 const DenseSet<std::pair<Value *, Use *>> &IdentityLeafs,
3521 const DenseSet<std::pair<Value *, Use *>> &SplatLeafs,
3522 const DenseSet<std::pair<Value *, Use *>> &ConcatLeafs,
3523 IRBuilderBase &Builder, const TargetTransformInfo *TTI) {
3524 auto [FrontV, FrontLane] = Item.front();
3525
3526 if (IdentityLeafs.contains(std::make_pair(FrontV, From))) {
3527 return FrontV;
3528 }
3529 if (SplatLeafs.contains(std::make_pair(FrontV, From))) {
3530 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
3531 return Builder.CreateShuffleVector(FrontV, Mask);
3532 }
3533 if (ConcatLeafs.contains(std::make_pair(FrontV, From))) {
3534 unsigned NumElts =
3535 cast<FixedVectorType>(FrontV->getType())->getNumElements();
3536 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
3537 for (unsigned S = 0; S < Values.size(); ++S)
3538 Values[S] = Item[S * NumElts].first;
3539
3540 while (Values.size() > 1) {
3541 NumElts *= 2;
3542 SmallVector<int, 16> Mask(NumElts, 0);
3543 std::iota(Mask.begin(), Mask.end(), 0);
3544 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
3545 for (unsigned S = 0; S < NewValues.size(); ++S)
3546 NewValues[S] =
3547 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
3548 Values = NewValues;
3549 }
3550 return Values[0];
3551 }
3552
3553 auto *I = cast<Instruction>(FrontV);
3554 auto *II = dyn_cast<IntrinsicInst>(I);
3555 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
 3556 SmallVector<Value *> Ops(NumOps);
 3557 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
3558 if (II &&
3559 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
3560 Ops[Idx] = II->getOperand(Idx);
3561 continue;
3562 }
3564 &I->getOperandUse(Idx), Ty, IdentityLeafs,
3565 SplatLeafs, ConcatLeafs, Builder, TTI);
3566 }
3567
3568 SmallVector<Value *, 8> ValueList;
3569 for (const auto &Lane : Item)
3570 if (Lane.first)
3571 ValueList.push_back(Lane.first);
3572
3573 Type *DstTy =
3574 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
3575 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
3576 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
3577 Ops[0], Ops[1]);
3578 propagateIRFlags(Value, ValueList);
3579 return Value;
3580 }
3581 if (auto *CI = dyn_cast<CmpInst>(I)) {
3582 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3583 propagateIRFlags(Value, ValueList);
3584 return Value;
3585 }
3586 if (auto *SI = dyn_cast<SelectInst>(I)) {
3587 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3588 propagateIRFlags(Value, ValueList);
3589 return Value;
3590 }
3591 if (auto *CI = dyn_cast<CastInst>(I)) {
3592 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3593 propagateIRFlags(Value, ValueList);
3594 return Value;
3595 }
3596 if (II) {
3597 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3598 propagateIRFlags(Value, ValueList);
3599 return Value;
3600 }
3601 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3602 auto *Value =
3603 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3604 propagateIRFlags(Value, ValueList);
3605 return Value;
3606}
3607
3608// Starting from a shuffle, look up through operands tracking the shuffled index
3609// of each lane. If we can simplify away the shuffles to identities then
3610// do so.
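// For illustration (a hypothetical example of the cases handled below): if
// %a and %b are each reversed by a shuffle, added together, and the result is
// reversed again, every lane of the final shuffle traces back to lane i of
// the original vectors, so the tree can be regenerated as a plain
// "add %a, %b".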
3611bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3612 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3613 if (!Ty || I.use_empty())
3614 return false;
3615
3616 SmallVector<InstLane> Start(Ty->getNumElements());
3617 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3618 Start[M] = lookThroughShuffles(&I, M);
3619
3621 Worklist.push_back(std::make_pair(Start, &*I.use_begin()));
3622 DenseSet<std::pair<Value *, Use *>> IdentityLeafs, SplatLeafs, ConcatLeafs;
3623 unsigned NumVisited = 0;
3624
3625 while (!Worklist.empty()) {
3626 if (++NumVisited > MaxInstrsToScan)
3627 return false;
3628
3629 auto ItemFrom = Worklist.pop_back_val();
3630 auto Item = ItemFrom.first;
3631 auto From = ItemFrom.second;
3632 auto [FrontV, FrontLane] = Item.front();
3633
3634 // If we found an undef first lane then bail out to keep things simple.
3635 if (!FrontV)
3636 return false;
3637
3638 // Helper to peek through bitcasts to the same value.
3639 auto IsEquiv = [&](Value *X, Value *Y) {
3640 return X->getType() == Y->getType() &&
3642 };
3643
3644 // Look for an identity value.
3645 if (FrontLane == 0 &&
3646 cast<FixedVectorType>(FrontV->getType())->getNumElements() ==
3647 Ty->getNumElements() &&
3648 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3649 Value *FrontV = Item.front().first;
3650 return !E.value().first || (IsEquiv(E.value().first, FrontV) &&
3651 E.value().second == (int)E.index());
3652 })) {
3653 IdentityLeafs.insert(std::make_pair(FrontV, From));
3654 continue;
3655 }
3656 // Look for constants, for the moment only supporting constant splats.
3657 if (auto *C = dyn_cast<Constant>(FrontV);
3658 C && C->getSplatValue() &&
3659 all_of(drop_begin(Item), [Item](InstLane &IL) {
3660 Value *FrontV = Item.front().first;
3661 Value *V = IL.first;
3662 return !V || (isa<Constant>(V) &&
3663 cast<Constant>(V)->getSplatValue() ==
3664 cast<Constant>(FrontV)->getSplatValue());
3665 })) {
3666 SplatLeafs.insert(std::make_pair(FrontV, From));
3667 continue;
3668 }
3669 // Look for a splat value.
3670 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3671 auto [FrontV, FrontLane] = Item.front();
3672 auto [V, Lane] = IL;
3673 return !V || (V == FrontV && Lane == FrontLane);
3674 })) {
3675 SplatLeafs.insert(std::make_pair(FrontV, From));
3676 continue;
3677 }
3678
3679 // We need each element to be the same type of value, and check that each
3680 // element has a single use.
3681 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3682 Value *FrontV = Item.front().first;
3683 if (!IL.first)
3684 return true;
3685 Value *V = IL.first;
3686 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3687 return false;
3688 if (V->getValueID() != FrontV->getValueID())
3689 return false;
3690 if (auto *CI = dyn_cast<CmpInst>(V))
3691 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3692 return false;
3693 if (auto *CI = dyn_cast<CastInst>(V))
3694 if (CI->getSrcTy()->getScalarType() !=
3695 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3696 return false;
3697 if (auto *SI = dyn_cast<SelectInst>(V))
3698 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3699 SI->getOperand(0)->getType() !=
3700 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3701 return false;
3702 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3703 return false;
3704 auto *II = dyn_cast<IntrinsicInst>(V);
3705 return !II || (isa<IntrinsicInst>(FrontV) &&
3706 II->getIntrinsicID() ==
3707 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3708 !II->hasOperandBundles());
3709 };
3710 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3711 // Check the operator is one that we support.
3712 if (isa<BinaryOperator, CmpInst>(FrontV)) {
3713 // We exclude div/rem in case they hit UB from poison lanes.
3714 if (auto *BO = dyn_cast<BinaryOperator>(FrontV);
3715 BO && BO->isIntDivRem())
3716 return false;
3718 &cast<Instruction>(FrontV)->getOperandUse(0));
3720 &cast<Instruction>(FrontV)->getOperandUse(1));
3721 continue;
3722 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3723 FPToUIInst, SIToFPInst, UIToFPInst>(FrontV)) {
3725 &cast<Instruction>(FrontV)->getOperandUse(0));
3726 continue;
3727 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontV)) {
3728 // TODO: Handle vector widening/narrowing bitcasts.
3729 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3730 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3731 if (DstTy && SrcTy &&
3732 SrcTy->getNumElements() == DstTy->getNumElements()) {
3734 &BitCast->getOperandUse(0));
3735 continue;
3736 }
3737 } else if (auto *Sel = dyn_cast<SelectInst>(FrontV)) {
3739 &Sel->getOperandUse(0));
3741 &Sel->getOperandUse(1));
3743 &Sel->getOperandUse(2));
3744 continue;
3745 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontV);
3746 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3747 !II->hasOperandBundles()) {
3748 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3749 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3750 &TTI)) {
3751 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3752 Value *FrontV = Item.front().first;
3753 Value *V = IL.first;
3754 return !V || (cast<Instruction>(V)->getOperand(Op) ==
3755 cast<Instruction>(FrontV)->getOperand(Op));
3756 }))
3757 return false;
3758 continue;
3759 }
3761 &cast<Instruction>(FrontV)->getOperandUse(Op));
3762 }
3763 continue;
3764 }
3765 }
3766
3767 if (isFreeConcat(Item, CostKind, TTI)) {
3768 ConcatLeafs.insert(std::make_pair(FrontV, From));
3769 continue;
3770 }
3771
3772 return false;
3773 }
3774
3775 if (NumVisited <= 1)
3776 return false;
3777
3778 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3779
3780 // If we got this far, we know the shuffles are superfluous and can be
3781 // removed. Scan through again and generate the new tree of instructions.
3782 Builder.SetInsertPoint(&I);
3783 Value *V = generateNewInstTree(Start, &*I.use_begin(), Ty, IdentityLeafs,
3784 SplatLeafs, ConcatLeafs, Builder, &TTI);
3785 replaceValue(I, *V);
3786 return true;
3787}
3788
3789/// Given a commutative reduction, the order of the input lanes does not alter
3790/// the results. We can use this to remove certain shuffles feeding the
3791/// reduction, removing the need to shuffle at all.
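///
/// E.g. (hypothetical IR):
///   %s = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
/// The reversed order does not change the reduced value, so the shuffle can
/// be replaced by one with a sorted (here identity) mask, which then folds
/// away.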
3792bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3793 auto *II = dyn_cast<IntrinsicInst>(&I);
3794 if (!II)
3795 return false;
3796 switch (II->getIntrinsicID()) {
3797 case Intrinsic::vector_reduce_add:
3798 case Intrinsic::vector_reduce_mul:
3799 case Intrinsic::vector_reduce_and:
3800 case Intrinsic::vector_reduce_or:
3801 case Intrinsic::vector_reduce_xor:
3802 case Intrinsic::vector_reduce_smin:
3803 case Intrinsic::vector_reduce_smax:
3804 case Intrinsic::vector_reduce_umin:
3805 case Intrinsic::vector_reduce_umax:
3806 break;
3807 default:
3808 return false;
3809 }
3810
3811 // Find all the inputs when looking through operations that do not alter the
3812 // lane order (binops, for example). Currently we look for a single shuffle,
3813 // and can ignore splat values.
3814 std::queue<Value *> Worklist;
3815 SmallPtrSet<Value *, 4> Visited;
3816 ShuffleVectorInst *Shuffle = nullptr;
3817 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3818 Worklist.push(Op);
3819
3820 while (!Worklist.empty()) {
3821 Value *CV = Worklist.front();
3822 Worklist.pop();
3823 if (Visited.contains(CV))
3824 continue;
3825
3826 // Splats don't change the order, so can be safely ignored.
3827 if (isSplatValue(CV))
3828 continue;
3829
3830 Visited.insert(CV);
3831
3832 if (auto *CI = dyn_cast<Instruction>(CV)) {
3833 if (CI->isBinaryOp()) {
3834 for (auto *Op : CI->operand_values())
3835 Worklist.push(Op);
3836 continue;
3837 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3838 if (Shuffle && Shuffle != SV)
3839 return false;
3840 Shuffle = SV;
3841 continue;
3842 }
3843 }
3844
3845 // Anything else is currently an unknown node.
3846 return false;
3847 }
3848
3849 if (!Shuffle)
3850 return false;
3851
3852 // Check all uses of the binary ops and shuffles are also included in the
3853 // lane-invariant operations (Visited should be the list of lanewise
3854 // instructions, including the shuffle that we found).
3855 for (auto *V : Visited)
3856 for (auto *U : V->users())
3857 if (!Visited.contains(U) && U != &I)
3858 return false;
3859
3860 FixedVectorType *VecType =
3861 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3862 if (!VecType)
3863 return false;
3864 FixedVectorType *ShuffleInputType =
3866 if (!ShuffleInputType)
3867 return false;
3868 unsigned NumInputElts = ShuffleInputType->getNumElements();
3869
3870 // Find the mask from sorting the lanes into order. This is most likely to
 3871 // become an identity or concat mask. Undef elements are pushed to the end.
3872 SmallVector<int> ConcatMask;
3873 Shuffle->getShuffleMask(ConcatMask);
3874 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3875 bool UsesSecondVec =
3876 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3877
3879 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3880 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3882 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3883 ShuffleInputType, ConcatMask, CostKind);
3884
3885 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3886 << "\n");
3887 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3888 << "\n");
3889 bool MadeChanges = false;
3890 if (NewCost < OldCost) {
3891 Builder.SetInsertPoint(Shuffle);
3892 Value *NewShuffle = Builder.CreateShuffleVector(
3893 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3894 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3895 replaceValue(*Shuffle, *NewShuffle);
3896 return true;
3897 }
3898
3899 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3900 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3901 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3902 return MadeChanges;
3903}
3904
3905/// For a given chain of patterns of the following form:
3906///
3907/// ```
3908/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3909///
3910/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3911/// ty1> %1)
3912/// OR
3913/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3914///
3915/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3916/// ...
3917/// ...
3918/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
 3919/// 3), <n x ty1> %(i - 2))
3920/// OR
3921/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3922///
3923/// %(i) = extractelement <n x ty1> %(i - 1), 0
3924/// ```
3925///
3926/// Where:
3927/// `mask` follows a partition pattern:
3928///
3929/// Ex:
3930/// [n = 8, p = poison]
3931///
3932/// 4 5 6 7 | p p p p
3933/// 2 3 | p p p p p p
3934/// 1 | p p p p p p p
3935///
3936/// For powers of 2, there's a consistent pattern, but for other cases
3937/// the parity of the current half value at each step decides the
 3938/// next partition half (see `ExpectedParityMask` below for how this is
 3939/// generalised).
3940///
3941/// Ex:
3942/// [n = 6]
3943///
3944/// 3 4 5 | p p p
3945/// 1 2 | p p p p
3946/// 1 | p p p p p
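///
/// A single link for n = 4 might look like (hypothetical IR):
///   %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
///   %m = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %v, <4 x i32> %s)
/// With one more such link and a final "extractelement ..., i64 0", the whole
/// chain can collapse to @llvm.vector.reduce.umax.v4i32(<4 x i32> %v).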
3947bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3948 // Going bottom-up for the pattern.
3949 std::queue<Value *> InstWorklist;
3950 InstructionCost OrigCost = 0;
3951
3952 // Common instruction operation after each shuffle op.
3953 std::optional<unsigned int> CommonCallOp = std::nullopt;
3954 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3955
3956 bool IsFirstCallOrBinInst = true;
3957 bool ShouldBeCallOrBinInst = true;
3958
3959 // This stores the last used instructions for shuffle/common op.
3960 //
3961 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3962 // instructions from either shuffle/common op.
3963 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3964
3965 Value *VecOpEE;
3966 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3967 return false;
3968
3969 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3970 if (!FVT)
3971 return false;
3972
3973 int64_t VecSize = FVT->getNumElements();
3974 if (VecSize < 2)
3975 return false;
3976
3977 // Number of levels would be ~log2(n), considering we always partition
3978 // by half for this fold pattern.
3979 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3980 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3981
3982 // This is how we generalise for all element sizes.
3983 // At each step, if vector size is odd, we need non-poison
3984 // values to cover the dominant half so we don't miss out on any element.
3985 //
3986 // This mask will help us retrieve this as we go from bottom to top:
3987 //
3988 // Mask Set -> N = N * 2 - 1
3989 // Mask Unset -> N = N * 2
3990 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3991 Cur = (Cur + 1) / 2, --Mask) {
3992 if (Cur & 1)
3993 ExpectedParityMask |= (1ll << Mask);
3994 }
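 // E.g. for VecSize = 6 (NumLevels = 3) the loop walks 6 -> 3 -> 2 and marks
 // only the odd size 3, so the shuffle halves seen bottom-up are 1, 2, 3
 // rather than 1, 2, 4.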
3995
3996 InstWorklist.push(VecOpEE);
3997
3998 while (!InstWorklist.empty()) {
3999 Value *CI = InstWorklist.front();
4000 InstWorklist.pop();
4001
4002 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
4003 if (!ShouldBeCallOrBinInst)
4004 return false;
4005
4006 if (!IsFirstCallOrBinInst && any_of(PrevVecV, equal_to(nullptr)))
4007 return false;
4008
4009 // For the first found call/bin op, the vector has to come from the
4010 // extract element op.
4011 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
4012 return false;
4013 IsFirstCallOrBinInst = false;
4014
4015 if (!CommonCallOp)
4016 CommonCallOp = II->getIntrinsicID();
4017 if (II->getIntrinsicID() != *CommonCallOp)
4018 return false;
4019
4020 switch (II->getIntrinsicID()) {
4021 case Intrinsic::umin:
4022 case Intrinsic::umax:
4023 case Intrinsic::smin:
4024 case Intrinsic::smax: {
4025 auto *Op0 = II->getOperand(0);
4026 auto *Op1 = II->getOperand(1);
4027 PrevVecV[0] = Op0;
4028 PrevVecV[1] = Op1;
4029 break;
4030 }
4031 default:
4032 return false;
4033 }
4034 ShouldBeCallOrBinInst ^= 1;
4035
4036 IntrinsicCostAttributes ICA(
4037 *CommonCallOp, II->getType(),
4038 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
4039 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
4040
4041 // We may need a swap here since it can be (a, b) or (b, a)
4042 // and accordingly change as we go up.
4043 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
4044 std::swap(PrevVecV[0], PrevVecV[1]);
4045 InstWorklist.push(PrevVecV[1]);
4046 InstWorklist.push(PrevVecV[0]);
4047 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
4048 // Similar logic for bin ops.
4049
4050 if (!ShouldBeCallOrBinInst)
4051 return false;
4052
4053 if (!IsFirstCallOrBinInst && any_of(PrevVecV, equal_to(nullptr)))
4054 return false;
4055
4056 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
4057 return false;
4058 IsFirstCallOrBinInst = false;
4059
4060 if (!CommonBinOp)
4061 CommonBinOp = BinOp->getOpcode();
4062
4063 if (BinOp->getOpcode() != *CommonBinOp)
4064 return false;
4065
4066 switch (*CommonBinOp) {
4067 case BinaryOperator::Add:
4068 case BinaryOperator::Mul:
4069 case BinaryOperator::Or:
4070 case BinaryOperator::And:
4071 case BinaryOperator::Xor: {
4072 auto *Op0 = BinOp->getOperand(0);
4073 auto *Op1 = BinOp->getOperand(1);
4074 PrevVecV[0] = Op0;
4075 PrevVecV[1] = Op1;
4076 break;
4077 }
4078 default:
4079 return false;
4080 }
4081 ShouldBeCallOrBinInst ^= 1;
4082
4083 OrigCost +=
4084 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
4085
4086 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
4087 std::swap(PrevVecV[0], PrevVecV[1]);
4088 InstWorklist.push(PrevVecV[1]);
4089 InstWorklist.push(PrevVecV[0]);
4090 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
 4091 // We shouldn't have any null values in the previous vectors;
 4092 // if so, there was a mismatch in the pattern.
4093 if (ShouldBeCallOrBinInst || any_of(PrevVecV, equal_to(nullptr)))
4094 return false;
4095
4096 if (SVInst != PrevVecV[1])
4097 return false;
4098
4099 ArrayRef<int> CurMask;
4100 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
4101 m_Mask(CurMask))))
4102 return false;
4103
4104 // Subtract the parity mask when checking the condition.
4105 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
4106 if (Mask < ShuffleMaskHalf &&
4107 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
4108 return false;
4109 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
4110 return false;
4111 }
4112
4113 // Update mask values.
4114 ShuffleMaskHalf *= 2;
4115 ShuffleMaskHalf -= (ExpectedParityMask & 1);
4116 ExpectedParityMask >>= 1;
4117
 4118 OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
 4119 SVInst->getType(), SVInst->getType(),
4120 CurMask, CostKind);
4121
4122 VisitedCnt += 1;
4123 if (!ExpectedParityMask && VisitedCnt == NumLevels)
4124 break;
4125
4126 ShouldBeCallOrBinInst ^= 1;
4127 } else {
4128 return false;
4129 }
4130 }
4131
4132 // Pattern should end with a shuffle op.
4133 if (ShouldBeCallOrBinInst)
4134 return false;
4135
4136 assert(VecSize != -1 && "Expected Match for Vector Size");
4137
4138 Value *FinalVecV = PrevVecV[0];
4139 if (!FinalVecV)
4140 return false;
4141
4142 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
4143
4144 Intrinsic::ID ReducedOp =
4145 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
4146 : getReductionForBinop(*CommonBinOp));
4147 if (!ReducedOp)
4148 return false;
4149
4150 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
 4151 InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
 4152
4153 if (NewCost >= OrigCost)
4154 return false;
4155
4156 auto *ReducedResult =
4157 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
4158 replaceValue(I, *ReducedResult);
4159
4160 return true;
4161}
4162
 4163/// Determine if it's more efficient to fold:
4164/// reduce(trunc(x)) -> trunc(reduce(x)).
4165/// reduce(sext(x)) -> sext(reduce(x)).
4166/// reduce(zext(x)) -> zext(reduce(x)).
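///
/// E.g. (hypothetical IR):
///   %t = trunc <8 x i32> %x to <8 x i8>
///   %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %t)
/// may become a wide i32 reduce.add of %x followed by a scalar trunc to i8,
/// since truncation commutes with modular addition.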
4167bool VectorCombine::foldCastFromReductions(Instruction &I) {
4168 auto *II = dyn_cast<IntrinsicInst>(&I);
4169 if (!II)
4170 return false;
4171
4172 bool TruncOnly = false;
4173 Intrinsic::ID IID = II->getIntrinsicID();
4174 switch (IID) {
4175 case Intrinsic::vector_reduce_add:
4176 case Intrinsic::vector_reduce_mul:
4177 TruncOnly = true;
4178 break;
4179 case Intrinsic::vector_reduce_and:
4180 case Intrinsic::vector_reduce_or:
4181 case Intrinsic::vector_reduce_xor:
4182 break;
4183 default:
4184 return false;
4185 }
4186
4187 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4188 Value *ReductionSrc = I.getOperand(0);
4189
4190 Value *Src;
4191 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
4192 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
4193 return false;
4194
4195 auto CastOpc =
4196 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
4197
4198 auto *SrcTy = cast<VectorType>(Src->getType());
4199 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
4200 Type *ResultTy = I.getType();
 4201
 4202 InstructionCost OldCost = TTI.getArithmeticReductionCost(
4203 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
4204 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
 4205 TTI::CastContextHint::None, CostKind,
 4206 cast<CastInst>(ReductionSrc));
4207 InstructionCost NewCost =
4208 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
4209 CostKind) +
4210 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
 4211 TTI::CastContextHint::None, CostKind);
 4212
4213 if (OldCost <= NewCost || !NewCost.isValid())
4214 return false;
4215
4216 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
4217 II->getIntrinsicID(), {Src});
4218 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
4219 replaceValue(I, *NewCast);
4220 return true;
4221}
4222
4223/// Fold:
4224/// icmp pred (reduce.{add,or,and,umax,umin}(signbit_extract(x))), C
4225/// into:
4226/// icmp sgt/slt (reduce.{or,umax,and,umin}(x)), -1/0
4227///
4228/// Sign-bit reductions produce values with known semantics:
4229/// - reduce.{or,umax}: 0 if no element is negative, 1 if any is
4230/// - reduce.{and,umin}: 1 if all elements are negative, 0 if any isn't
4231/// - reduce.add: count of negative elements (0 to NumElts)
4232///
4233/// Both lshr and ashr are supported:
4234/// - lshr produces 0 or 1, so reduce.add range is [0, N]
4235/// - ashr produces 0 or -1, so reduce.add range is [-N, 0]
4236///
4237/// The fold generalizes to multiple source vectors combined with the same
4238/// operation as the reduction. For example:
4239/// reduce.or(or(shr A, shr B)) conceptually extends the vector
4240/// For reduce.add, this changes the count to M*N where M is the number of
4241/// source vectors.
4242///
4243/// We transform to a direct sign check on the original vector using
4244/// reduce.{or,umax} or reduce.{and,umin}.
4245///
4246/// In spirit, it's similar to foldSignBitCheck in InstCombine.
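///
/// E.g. (hypothetical IR):
///   %s = lshr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
///   %r = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %s)
///   %c = icmp eq i32 %r, 0
/// asks whether no element of %x is negative, which is the same as
///   icmp sgt i32 (@llvm.vector.reduce.or.v4i32(<4 x i32> %x)), -1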
4247bool VectorCombine::foldSignBitReductionCmp(Instruction &I) {
4248 CmpPredicate Pred;
4249 IntrinsicInst *ReduceOp;
4250 const APInt *CmpVal;
4251 if (!match(&I,
4252 m_ICmp(Pred, m_OneUse(m_AnyIntrinsic(ReduceOp)), m_APInt(CmpVal))))
4253 return false;
4254
4255 Intrinsic::ID OrigIID = ReduceOp->getIntrinsicID();
4256 switch (OrigIID) {
4257 case Intrinsic::vector_reduce_or:
4258 case Intrinsic::vector_reduce_umax:
4259 case Intrinsic::vector_reduce_and:
4260 case Intrinsic::vector_reduce_umin:
4261 case Intrinsic::vector_reduce_add:
4262 break;
4263 default:
4264 return false;
4265 }
4266
4267 Value *ReductionSrc = ReduceOp->getArgOperand(0);
4268 auto *VecTy = dyn_cast<FixedVectorType>(ReductionSrc->getType());
4269 if (!VecTy)
4270 return false;
4271
4272 unsigned BitWidth = VecTy->getScalarSizeInBits();
4273 if (BitWidth == 1)
4274 return false;
4275
4276 unsigned NumElts = VecTy->getNumElements();
4277
4278 // Determine the expected tree opcode for multi-vector patterns.
4279 // The tree opcode must match the reduction's underlying operation.
4280 //
4281 // TODO: for pairs of equivalent operators, we should match both,
4282 // not only the most common.
4283 Instruction::BinaryOps TreeOpcode;
4284 switch (OrigIID) {
4285 case Intrinsic::vector_reduce_or:
4286 case Intrinsic::vector_reduce_umax:
4287 TreeOpcode = Instruction::Or;
4288 break;
4289 case Intrinsic::vector_reduce_and:
4290 case Intrinsic::vector_reduce_umin:
4291 TreeOpcode = Instruction::And;
4292 break;
4293 case Intrinsic::vector_reduce_add:
4294 TreeOpcode = Instruction::Add;
4295 break;
4296 default:
4297 llvm_unreachable("Unexpected intrinsic");
4298 }
4299
4300 // Collect sign-bit extraction leaves from an associative tree of TreeOpcode.
4301 // The tree conceptually extends the vector being reduced.
4302 SmallVector<Value *, 8> Worklist;
4303 SmallVector<Value *, 8> Sources; // Original vectors (X in shr X, BW-1)
4304 Worklist.push_back(ReductionSrc);
4305 std::optional<bool> IsAShr;
4306 constexpr unsigned MaxSources = 8;
4307
4308 // Calculate old cost: all shifts + tree ops + reduction
4309 InstructionCost OldCost = TTI.getInstructionCost(ReduceOp, CostKind);
4310
4311 while (!Worklist.empty() && Worklist.size() <= MaxSources &&
4312 Sources.size() <= MaxSources) {
4313 Value *V = Worklist.pop_back_val();
4314
4315 // Try to match sign-bit extraction: shr X, (bitwidth-1)
4316 Value *X;
4317 if (match(V, m_OneUse(m_Shr(m_Value(X), m_SpecificInt(BitWidth - 1))))) {
4318 auto *Shr = cast<Instruction>(V);
4319
4320 // All shifts must be the same type (all lshr or all ashr)
4321 bool ThisIsAShr = Shr->getOpcode() == Instruction::AShr;
4322 if (!IsAShr)
4323 IsAShr = ThisIsAShr;
4324 else if (*IsAShr != ThisIsAShr)
4325 return false;
4326
4327 Sources.push_back(X);
4328
4329 // As part of the fold, we remove all of the shifts, so we need to keep
4330 // track of their costs.
4331 OldCost += TTI.getInstructionCost(Shr, CostKind);
4332
4333 continue;
4334 }
4335
4336 // Try to extend through a tree node of the expected opcode
4337 Value *A, *B;
4338 if (!match(V, m_OneUse(m_BinOp(TreeOpcode, m_Value(A), m_Value(B)))))
4339 return false;
4340
4341 // We are potentially replacing these operations as well, so we add them
4342 // to the costs.
4344
4345 Worklist.push_back(A);
4346 Worklist.push_back(B);
4347 }
4348
4349 // Must have at least one source and not exceed limit
4350 if (Sources.empty() || Sources.size() > MaxSources ||
4351 Worklist.size() > MaxSources || !IsAShr)
4352 return false;
4353
4354 unsigned NumSources = Sources.size();
4355
4356 // For reduce.add, the total count must fit as a signed integer.
4357 // Range is [0, M*N] for lshr or [-M*N, 0] for ashr.
4358 if (OrigIID == Intrinsic::vector_reduce_add &&
4359 !isIntN(BitWidth, NumSources * NumElts))
4360 return false;
4361
4362 // Compute the boundary value when all elements are negative:
4363 // - Per-element contribution: 1 for lshr, -1 for ashr
4364 // - For add: M*N (total elements across all sources); for others: just 1
4365 unsigned Count =
4366 (OrigIID == Intrinsic::vector_reduce_add) ? NumSources * NumElts : 1;
4367 APInt NegativeVal(CmpVal->getBitWidth(), Count);
4368 if (*IsAShr)
4369 NegativeVal.negate();
4370
4371 // Range is [min(0, AllNegVal), max(0, AllNegVal)]
4372 APInt Zero = APInt::getZero(CmpVal->getBitWidth());
4373 APInt RangeLow = APIntOps::smin(Zero, NegativeVal);
4374 APInt RangeHigh = APIntOps::smax(Zero, NegativeVal);
4375
4376 // Determine comparison semantics:
4377 // - IsEq: true for equality test, false for inequality
4378 // - TestsNegative: true if testing against AllNegVal, false for zero
4379 //
4380 // In addition to EQ/NE against 0 or AllNegVal, we support inequalities
4381 // that fold to boundary tests given the narrow value range:
4382 // < RangeHigh -> != RangeHigh
4383 // > RangeHigh-1 -> == RangeHigh
4384 // > RangeLow -> != RangeLow
4385 // < RangeLow+1 -> == RangeLow
4386 //
4387 // For inequalities, we work with signed predicates only. Unsigned predicates
4388 // are canonicalized to signed when the range is non-negative (where they are
4389 // equivalent). When the range includes negative values, unsigned predicates
4390 // would have different semantics due to wrap-around, so we reject them.
4391 if (!ICmpInst::isEquality(Pred) && !ICmpInst::isSigned(Pred)) {
4392 if (RangeLow.isNegative())
4393 return false;
4394 Pred = ICmpInst::getSignedPredicate(Pred);
4395 }
4396
4397 bool IsEq;
4398 bool TestsNegative;
4399 if (ICmpInst::isEquality(Pred)) {
4400 if (CmpVal->isZero()) {
4401 TestsNegative = false;
4402 } else if (*CmpVal == NegativeVal) {
4403 TestsNegative = true;
4404 } else {
4405 return false;
4406 }
4407 IsEq = Pred == ICmpInst::ICMP_EQ;
4408 } else if (Pred == ICmpInst::ICMP_SLT && *CmpVal == RangeHigh) {
4409 IsEq = false;
4410 TestsNegative = (RangeHigh == NegativeVal);
4411 } else if (Pred == ICmpInst::ICMP_SGT && *CmpVal == RangeHigh - 1) {
4412 IsEq = true;
4413 TestsNegative = (RangeHigh == NegativeVal);
4414 } else if (Pred == ICmpInst::ICMP_SGT && *CmpVal == RangeLow) {
4415 IsEq = false;
4416 TestsNegative = (RangeLow == NegativeVal);
4417 } else if (Pred == ICmpInst::ICMP_SLT && *CmpVal == RangeLow + 1) {
4418 IsEq = true;
4419 TestsNegative = (RangeLow == NegativeVal);
4420 } else {
4421 return false;
4422 }
4423
4424 // For this fold we support four types of checks:
4425 //
4426 // 1. All lanes are negative - AllNeg
4427 // 2. All lanes are non-negative - AllNonNeg
4428 // 3. At least one negative lane - AnyNeg
4429 // 4. At least one non-negative lane - AnyNonNeg
4430 //
4431 // For each case, we can generate the following code:
4432 //
4433 // 1. AllNeg - reduce.and/umin(X) < 0
4434 // 2. AllNonNeg - reduce.or/umax(X) > -1
4435 // 3. AnyNeg - reduce.or/umax(X) < 0
4436 // 4. AnyNonNeg - reduce.and/umin(X) > -1
4437 //
4438 // The table below shows the aggregation of all supported cases
4439 // using these four cases.
4440 //
4441 // Reduction | == 0 | != 0 | == MAX | != MAX
4442 // ------------+-----------+-----------+-----------+-----------
4443 // or/umax | AllNonNeg | AnyNeg | AnyNeg | AllNonNeg
4444 // and/umin | AnyNonNeg | AllNeg | AllNeg | AnyNonNeg
4445 // add | AllNonNeg | AnyNeg | AllNeg | AnyNonNeg
4446 //
4447 // NOTE: MAX = 1 for or/and/umax/umin, and the vector size N for add
4448 //
4449 // For easier codegen and check inversion, we use the following encoding:
4450 //
4451 // 1. Bit-3 === requires or/umax (1) or and/umin (0) check
4452 // 2. Bit-2 === checks < 0 (1) or > -1 (0)
4453 // 3. Bit-1 === universal (1) or existential (0) check
4454 //
4455 // AnyNeg = 0b110: uses or/umax, checks negative, any-check
4456 // AllNonNeg = 0b101: uses or/umax, checks non-neg, all-check
4457 // AnyNonNeg = 0b000: uses and/umin, checks non-neg, any-check
4458 // AllNeg = 0b011: uses and/umin, checks negative, all-check
4459 //
4460 // XOR with 0b011 inverts the check (swaps all/any and neg/non-neg).
4461 //
4462 enum CheckKind : unsigned {
4463 AnyNonNeg = 0b000,
4464 AllNeg = 0b011,
4465 AllNonNeg = 0b101,
4466 AnyNeg = 0b110,
4467 };
4468 // Return true if we fold this check into or/umax and false for and/umin
4469 auto RequiresOr = [](CheckKind C) -> bool { return C & 0b100; };
4470 // Return true if we should check if result is negative and false otherwise
4471 auto IsNegativeCheck = [](CheckKind C) -> bool { return C & 0b010; };
4472 // Logically invert the check
4473 auto Invert = [](CheckKind C) { return CheckKind(C ^ 0b011); };
4474
4475 CheckKind Base;
4476 switch (OrigIID) {
4477 case Intrinsic::vector_reduce_or:
4478 case Intrinsic::vector_reduce_umax:
4479 Base = TestsNegative ? AnyNeg : AllNonNeg;
4480 break;
4481 case Intrinsic::vector_reduce_and:
4482 case Intrinsic::vector_reduce_umin:
4483 Base = TestsNegative ? AllNeg : AnyNonNeg;
4484 break;
4485 case Intrinsic::vector_reduce_add:
4486 Base = TestsNegative ? AllNeg : AllNonNeg;
4487 break;
4488 default:
4489 llvm_unreachable("Unexpected intrinsic");
4490 }
4491
4492 CheckKind Check = IsEq ? Base : Invert(Base);
4493
4494 auto PickCheaper = [&](Intrinsic::ID Arith, Intrinsic::ID MinMax) {
 4495 InstructionCost ArithCost =
 4496 TTI.getArithmeticReductionCost(getArithmeticReductionInstruction(Arith),
 4497 VecTy, std::nullopt, CostKind);
 4498 InstructionCost MinMaxCost =
 4499 TTI.getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(MinMax), VecTy,
 4500 FastMathFlags(), CostKind);
4501 return ArithCost <= MinMaxCost ? std::make_pair(Arith, ArithCost)
4502 : std::make_pair(MinMax, MinMaxCost);
4503 };
4504
4505 // Choose output reduction based on encoding's MSB
4506 auto [NewIID, NewCost] = RequiresOr(Check)
4507 ? PickCheaper(Intrinsic::vector_reduce_or,
4508 Intrinsic::vector_reduce_umax)
4509 : PickCheaper(Intrinsic::vector_reduce_and,
4510 Intrinsic::vector_reduce_umin);
4511
4512 // Add cost of combining multiple sources with or/and
4513 if (NumSources > 1) {
4514 unsigned CombineOpc =
4515 RequiresOr(Check) ? Instruction::Or : Instruction::And;
4516 NewCost += TTI.getArithmeticInstrCost(CombineOpc, VecTy, CostKind) *
4517 (NumSources - 1);
4518 }
4519
4520 LLVM_DEBUG(dbgs() << "Found sign-bit reduction cmp: " << I << "\n OldCost: "
4521 << OldCost << " vs NewCost: " << NewCost << "\n");
4522
4523 if (NewCost > OldCost)
4524 return false;
4525
4526 // Generate the combined input and reduction
4527 Builder.SetInsertPoint(&I);
4528 Type *ScalarTy = VecTy->getScalarType();
4529
4530 Value *Input;
4531 if (NumSources == 1) {
4532 Input = Sources[0];
4533 } else {
4534 // Combine sources with or/and based on check type
4535 Input = RequiresOr(Check) ? Builder.CreateOr(Sources)
4536 : Builder.CreateAnd(Sources);
4537 }
4538
4539 Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, NewIID, {Input});
4540 Value *NewCmp = IsNegativeCheck(Check) ? Builder.CreateIsNeg(NewReduce)
4541 : Builder.CreateIsNotNeg(NewReduce);
4542 replaceValue(I, *NewCmp);
4543 return true;
4544}
4545
4546/// vector.reduce.OP f(X_i) == 0 -> vector.reduce.OP X_i == 0
4547///
4548/// We can prove it for cases when:
4549///
4550/// 1. OP X_i == 0 <=> \forall i \in [1, N] X_i == 0
4551/// 1'. OP X_i == 0 <=> \exists j \in [1, N] X_j == 0
4552/// 2. f(x) == 0 <=> x == 0
4553///
4554/// From 1 and 2 (or 1' and 2), we can infer that
4555///
4556/// OP f(X_i) == 0 <=> OP X_i == 0.
4557///
4558/// (1)
4559/// OP f(X_i) == 0 <=> \forall i \in [1, N] f(X_i) == 0
4560/// (2)
4561/// <=> \forall i \in [1, N] X_i == 0
4562/// (1)
4563/// <=> OP(X_i) == 0
4564///
4565/// For some of the OP's and f's, we need to have domain constraints on X
4566/// to ensure properties 1 (or 1') and 2.
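///
/// E.g. (hypothetical IR):
///   %z = zext <4 x i16> %x to <4 x i32>
///   %r = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %z)
///   %c = icmp eq i32 %r, 0
/// may be rewritten to compare @llvm.vector.reduce.or.v4i16(<4 x i16> %x)
/// against zero, since zext(x) == 0 exactly when x == 0.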
4567bool VectorCombine::foldICmpEqZeroVectorReduce(Instruction &I) {
4568 CmpPredicate Pred;
4569 Value *Op;
4570 if (!match(&I, m_ICmp(Pred, m_Value(Op), m_Zero())) ||
4571 !ICmpInst::isEquality(Pred))
4572 return false;
4573
4574 auto *II = dyn_cast<IntrinsicInst>(Op);
4575 if (!II)
4576 return false;
4577
4578 switch (II->getIntrinsicID()) {
4579 case Intrinsic::vector_reduce_add:
4580 case Intrinsic::vector_reduce_or:
4581 case Intrinsic::vector_reduce_umin:
4582 case Intrinsic::vector_reduce_umax:
4583 case Intrinsic::vector_reduce_smin:
4584 case Intrinsic::vector_reduce_smax:
4585 break;
4586 default:
4587 return false;
4588 }
4589
4590 Value *InnerOp = II->getArgOperand(0);
4591
4592 // TODO: fixed vector type might be too restrictive
4593 if (!II->hasOneUse() || !isa<FixedVectorType>(InnerOp->getType()))
4594 return false;
4595
4596 Value *X = nullptr;
4597
4598 // Check for zero-preserving operations where f(x) = 0 <=> x = 0
4599 //
4600 // 1. f(x) = shl nuw x, y for arbitrary y
4601 // 2. f(x) = mul nuw x, c for defined c != 0
4602 // 3. f(x) = zext x
4603 // 4. f(x) = sext x
4604 // 5. f(x) = neg x
4605 //
4606 if (!(match(InnerOp, m_NUWShl(m_Value(X), m_Value())) || // Case 1
4607 match(InnerOp, m_NUWMul(m_Value(X), m_NonZeroInt())) || // Case 2
4608 match(InnerOp, m_ZExt(m_Value(X))) || // Case 3
4609 match(InnerOp, m_SExt(m_Value(X))) || // Case 4
4610 match(InnerOp, m_Neg(m_Value(X))) // Case 5
4611 ))
4612 return false;
4613
4614 SimplifyQuery S = SQ.getWithInstruction(&I);
4615 auto *XTy = cast<FixedVectorType>(X->getType());
4616
4617 // Check for domain constraints for all supported reductions.
4618 //
4619 // a. OR X_i - has property 1 for every X
4620 // b. UMAX X_i - has property 1 for every X
4621 // c. UMIN X_i - has property 1' for every X
4622 // d. SMAX X_i - has property 1 for X >= 0
4623 // e. SMIN X_i - has property 1' for X >= 0
4624 // f. ADD X_i - has property 1 for X >= 0 && ADD X_i doesn't sign wrap
4625 //
4626 // In order for the proof to work, we need 1 (or 1') to be true for both
4627 // OP f(X_i) and OP X_i and that's why below we check constraints twice.
4628 //
4629 // NOTE: ADD X_i holds property 1 for a mirror case as well, i.e. when
4630 // X <= 0 && ADD X_i doesn't sign wrap. However, due to the nature
4631 // of known bits, we can't reasonably hold knowledge of "either 0
4632 // or negative".
4633 switch (II->getIntrinsicID()) {
4634 case Intrinsic::vector_reduce_add: {
4635 // We need to check that both X_i and f(X_i) have enough leading
4636 // zeros to not overflow.
4637 KnownBits KnownX = computeKnownBits(X, S);
4638 KnownBits KnownFX = computeKnownBits(InnerOp, S);
4639 unsigned NumElems = XTy->getNumElements();
4640 // Adding N elements loses at most ceil(log2(N)) leading bits.
4641 unsigned LostBits = Log2_32_Ceil(NumElems);
4642 unsigned LeadingZerosX = KnownX.countMinLeadingZeros();
4643 unsigned LeadingZerosFX = KnownFX.countMinLeadingZeros();
4644 // Need at least one leading zero left after summation to ensure no overflow
4645 if (LeadingZerosX <= LostBits || LeadingZerosFX <= LostBits)
4646 return false;
4647
4648 // We are not checking whether X or f(X) are positive explicitly because
4649 // we implicitly checked for it when we checked if both cases have enough
4650 // leading zeros to not wrap addition.
4651 break;
4652 }
4653 case Intrinsic::vector_reduce_smin:
4654 case Intrinsic::vector_reduce_smax:
4655 // Check whether X >= 0 and f(X) >= 0
4656 if (!isKnownNonNegative(InnerOp, S) || !isKnownNonNegative(X, S))
4657 return false;
4658
4659 break;
4660 default:
4661 break;
4662 };
4663
4664 LLVM_DEBUG(dbgs() << "Found a reduction to 0 comparison with removable op: "
4665 << *II << "\n");
4666
4667 // For zext/sext, check if the transform is profitable using cost model.
4668 // For other operations (shl, mul, neg), we're removing an instruction
4669 // while keeping the same reduction type, so it's always profitable.
4670 if (isa<ZExtInst>(InnerOp) || isa<SExtInst>(InnerOp)) {
4671 auto *FXTy = cast<FixedVectorType>(InnerOp->getType());
4672 Intrinsic::ID IID = II->getIntrinsicID();
4673
 4674 InstructionCost ExtCost = TTI.getCastInstrCost(
 4675 cast<CastInst>(InnerOp)->getOpcode(), FXTy, XTy,
 4676 TTI::CastContextHint::None, CostKind);
 4677
4678 InstructionCost OldReduceCost, NewReduceCost;
4679 switch (IID) {
4680 case Intrinsic::vector_reduce_add:
4681 case Intrinsic::vector_reduce_or:
4682 OldReduceCost = TTI.getArithmeticReductionCost(
4683 getArithmeticReductionInstruction(IID), FXTy, std::nullopt, CostKind);
4684 NewReduceCost = TTI.getArithmeticReductionCost(
4685 getArithmeticReductionInstruction(IID), XTy, std::nullopt, CostKind);
4686 break;
4687 case Intrinsic::vector_reduce_umin:
4688 case Intrinsic::vector_reduce_umax:
4689 case Intrinsic::vector_reduce_smin:
4690 case Intrinsic::vector_reduce_smax:
4691 OldReduceCost = TTI.getMinMaxReductionCost(
4692 getMinMaxReductionIntrinsicOp(IID), FXTy, FastMathFlags(), CostKind);
4693 NewReduceCost = TTI.getMinMaxReductionCost(
4694 getMinMaxReductionIntrinsicOp(IID), XTy, FastMathFlags(), CostKind);
4695 break;
4696 default:
4697 llvm_unreachable("Unexpected reduction");
4698 }
4699
4700 InstructionCost OldCost = OldReduceCost + ExtCost;
4701 InstructionCost NewCost =
4702 NewReduceCost + (InnerOp->hasOneUse() ? 0 : ExtCost);
4703
4704 LLVM_DEBUG(dbgs() << "Found a removable extension before reduction: "
4705 << *InnerOp << "\n OldCost: " << OldCost
4706 << " vs NewCost: " << NewCost << "\n");
4707
4708 // We consider transformation to still be potentially beneficial even
4709 // when the costs are the same because we might remove a use from f(X)
4710 // and unlock other optimizations. Equal costs would just mean that we
4711 // didn't make it worse in the worst case.
4712 if (NewCost > OldCost)
4713 return false;
4714 }
4715
4716 // Since we support zext and sext as f, we might change the scalar type
4717 // of the intrinsic.
4718 Type *Ty = XTy->getScalarType();
4719 Value *NewReduce = Builder.CreateIntrinsic(Ty, II->getIntrinsicID(), {X});
4720 Value *NewCmp =
4721 Builder.CreateICmp(Pred, NewReduce, ConstantInt::getNullValue(Ty));
4722 replaceValue(I, *NewCmp);
4723 return true;
4724}
4725
4726/// Fold comparisons of reduce.or/reduce.and with reduce.umax/reduce.umin
4727/// based on cost, preserving the comparison semantics.
4728///
4729 /// We use three fundamental properties for each pair:
4730///
4731/// 1. or(X) == 0 <=> umax(X) == 0
4732/// 2. or(X) == 1 <=> umax(X) == 1
4733/// 3. sign(or(X)) == sign(umax(X))
4734///
4735/// 1. and(X) == -1 <=> umin(X) == -1
4736/// 2. and(X) == -2 <=> umin(X) == -2
4737/// 3. sign(and(X)) == sign(umin(X))
4738///
4739/// From these we can infer the following transformations:
4740/// a. or(X) ==/!= 0 <-> umax(X) ==/!= 0
4741/// b. or(X) s< 0 <-> umax(X) s< 0
4742/// c. or(X) s> -1 <-> umax(X) s> -1
4743/// d. or(X) s< 1 <-> umax(X) s< 1
4744/// e. or(X) ==/!= 1 <-> umax(X) ==/!= 1
4745/// f. or(X) s< 2 <-> umax(X) s< 2
4746/// g. and(X) ==/!= -1 <-> umin(X) ==/!= -1
4747/// h. and(X) s< 0 <-> umin(X) s< 0
4748/// i. and(X) s> -1 <-> umin(X) s> -1
4749/// j. and(X) s> -2 <-> umin(X) s> -2
4750/// k. and(X) ==/!= -2 <-> umin(X) ==/!= -2
4751/// l. and(X) s> -3 <-> umin(X) s> -3
4752///
4753bool VectorCombine::foldEquivalentReductionCmp(Instruction &I) {
4754 CmpPredicate Pred;
4755 Value *ReduceOp;
4756 const APInt *CmpVal;
4757 if (!match(&I, m_ICmp(Pred, m_Value(ReduceOp), m_APInt(CmpVal))))
4758 return false;
4759
4760 auto *II = dyn_cast<IntrinsicInst>(ReduceOp);
4761 if (!II || !II->hasOneUse())
4762 return false;
4763
4764 const auto IsValidOrUmaxCmp = [&]() {
4765 // or === umax for i1
4766 if (CmpVal->getBitWidth() == 1)
4767 return true;
4768
4769 // Cases a and e
4770 bool IsEquality =
4771 (CmpVal->isZero() || CmpVal->isOne()) && ICmpInst::isEquality(Pred);
4772 // Case c
4773 bool IsPositive = CmpVal->isAllOnes() && Pred == ICmpInst::ICMP_SGT;
4774 // Cases b, d, and f
4775 bool IsNegative = (CmpVal->isZero() || CmpVal->isOne() || *CmpVal == 2) &&
4776 Pred == ICmpInst::ICMP_SLT;
4777 return IsEquality || IsPositive || IsNegative;
4778 };
4779
4780 const auto IsValidAndUminCmp = [&]() {
4781 // and === umin for i1
4782 if (CmpVal->getBitWidth() == 1)
4783 return true;
4784
4785 const auto LeadingOnes = CmpVal->countl_one();
4786
4787 // Cases g and k
4788 bool IsEquality =
4789 (CmpVal->isAllOnes() || LeadingOnes + 1 == CmpVal->getBitWidth()) &&
4790 ICmpInst::isEquality(Pred);
4791 // Case h
4792 bool IsNegative = CmpVal->isZero() && Pred == ICmpInst::ICMP_SLT;
4793 // Cases i, j, and l
4794 bool IsPositive =
4795 // if the number has at least N - 2 leading ones
4796 // and the two LSBs are:
4797 // - 1 x 1 -> -1
4798 // - 1 x 0 -> -2
4799 // - 0 x 1 -> -3
4800 LeadingOnes + 2 >= CmpVal->getBitWidth() &&
4801 ((*CmpVal)[0] || (*CmpVal)[1]) && Pred == ICmpInst::ICMP_SGT;
4802 return IsEquality || IsNegative || IsPositive;
4803 };
4804
4805 Intrinsic::ID OriginalIID = II->getIntrinsicID();
4806 Intrinsic::ID AlternativeIID;
4807
4808 // Check if this is a valid comparison pattern and determine the alternate
4809 // reduction intrinsic.
4810 switch (OriginalIID) {
4811 case Intrinsic::vector_reduce_or:
4812 if (!IsValidOrUmaxCmp())
4813 return false;
4814 AlternativeIID = Intrinsic::vector_reduce_umax;
4815 break;
4816 case Intrinsic::vector_reduce_umax:
4817 if (!IsValidOrUmaxCmp())
4818 return false;
4819 AlternativeIID = Intrinsic::vector_reduce_or;
4820 break;
4821 case Intrinsic::vector_reduce_and:
4822 if (!IsValidAndUminCmp())
4823 return false;
4824 AlternativeIID = Intrinsic::vector_reduce_umin;
4825 break;
4826 case Intrinsic::vector_reduce_umin:
4827 if (!IsValidAndUminCmp())
4828 return false;
4829 AlternativeIID = Intrinsic::vector_reduce_and;
4830 break;
4831 default:
4832 return false;
4833 }
4834
4835 Value *X = II->getArgOperand(0);
4836 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
4837 if (!VecTy)
4838 return false;
4839
4840 const auto GetReductionCost = [&](Intrinsic::ID IID) -> InstructionCost {
4841 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4842 if (ReductionOpc != Instruction::ICmp)
4843 return TTI.getArithmeticReductionCost(ReductionOpc, VecTy, std::nullopt,
4844 CostKind);
4845 return TTI.getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID), VecTy,
4846 FastMathFlags(), CostKind);
4847 };
4848
4849 InstructionCost OrigCost = GetReductionCost(OriginalIID);
4850 InstructionCost AltCost = GetReductionCost(AlternativeIID);
4851
4852 LLVM_DEBUG(dbgs() << "Found equivalent reduction cmp: " << I
4853 << "\n OrigCost: " << OrigCost
4854 << " vs AltCost: " << AltCost << "\n");
4855
4856 if (AltCost >= OrigCost)
4857 return false;
4858
4859 Builder.SetInsertPoint(&I);
4860 Type *ScalarTy = VecTy->getScalarType();
4861 Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, AlternativeIID, {X});
4862 Value *NewCmp =
4863 Builder.CreateICmp(Pred, NewReduce, ConstantInt::get(ScalarTy, *CmpVal));
4864
4865 replaceValue(I, *NewCmp);
4866 return true;
4867}
4868
4869/// Returns true if this ShuffleVectorInst eventually feeds into a
4870/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
4871/// chains of shuffles and binary operators (in any combination/order).
4872 /// The search visits at most a fixed number of instructions (MaxVisited).
4873 static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
4874 constexpr unsigned MaxVisited = 32;
4875 SmallPtrSet<Instruction *, 8> Visited;
4876 SmallVector<Instruction *> WorkList;
4877 bool FoundReduction = false;
4878
4879 WorkList.push_back(SVI);
4880 while (!WorkList.empty()) {
4881 Instruction *I = WorkList.pop_back_val();
4882 for (User *U : I->users()) {
4883 auto *UI = cast<Instruction>(U);
4884 if (!UI || !Visited.insert(UI).second)
4885 continue;
4886 if (Visited.size() > MaxVisited)
4887 return false;
4888 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
4889 // More than one reduction reached
4890 if (FoundReduction)
4891 return false;
4892 switch (II->getIntrinsicID()) {
4893 case Intrinsic::vector_reduce_add:
4894 case Intrinsic::vector_reduce_mul:
4895 case Intrinsic::vector_reduce_and:
4896 case Intrinsic::vector_reduce_or:
4897 case Intrinsic::vector_reduce_xor:
4898 case Intrinsic::vector_reduce_smin:
4899 case Intrinsic::vector_reduce_smax:
4900 case Intrinsic::vector_reduce_umin:
4901 case Intrinsic::vector_reduce_umax:
4902 FoundReduction = true;
4903 continue;
4904 default:
4905 return false;
4906 }
4907 }
4908
4909 if (!isa<BinaryOperator>(UI) && !isa<ShuffleVectorInst>(UI))
4910 return false;
4911
4912 WorkList.emplace_back(UI);
4913 }
4914 }
4915 return FoundReduction;
4916}
4917
4918/// This method looks for groups of shuffles acting on binops, of the form:
4919/// %x = shuffle ...
4920/// %y = shuffle ...
4921/// %a = binop %x, %y
4922/// %b = binop %x, %y
4923/// shuffle %a, %b, selectmask
4924/// We may, especially if the shuffle is wider than legal, be able to convert
4925/// the shuffle to a form where only parts of a and b need to be computed. On
4926/// architectures with no obvious "select" shuffle, this can reduce the total
4927/// number of operations if the target reports them as cheaper.
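///
/// For example, with <8 x i32> operands and a select mask such as
///   <0, 1, 2, 3, 12, 13, 14, 15>
/// only the low half of %a and the high half of %b are used, so on a target
/// with 128-bit vector registers each binop may be performed on a single
/// <4 x i32> half rather than the full <8 x i32> vector.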
4928bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
4929 auto *SVI = cast<ShuffleVectorInst>(&I);
4930 auto *VT = cast<FixedVectorType>(I.getType());
4931 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
4932 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
4933 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
4934 VT != Op0->getType())
4935 return false;
4936
4937 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
4938 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
4939 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
4940 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
4941 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
4942 auto checkSVNonOpUses = [&](Instruction *I) {
4943 if (!I || I->getOperand(0)->getType() != VT)
4944 return true;
4945 return any_of(I->users(), [&](User *U) {
4946 return U != Op0 && U != Op1 &&
4947 !(isa<ShuffleVectorInst>(U) &&
4948 (InputShuffles.contains(cast<Instruction>(U)) ||
4949 isInstructionTriviallyDead(cast<Instruction>(U))));
4950 });
4951 };
4952 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
4953 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
4954 return false;
4955
4956 // Collect all the uses that are shuffles that we can transform together. We
4957 // may not have a single shuffle, but a group that can all be transformed
4958 // together profitably.
4959 SmallVector<ShuffleVectorInst *> Shuffles;
4960 auto collectShuffles = [&](Instruction *I) {
4961 for (auto *U : I->users()) {
4962 auto *SV = dyn_cast<ShuffleVectorInst>(U);
4963 if (!SV || SV->getType() != VT)
4964 return false;
4965 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
4966 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
4967 return false;
4968 if (!llvm::is_contained(Shuffles, SV))
4969 Shuffles.push_back(SV);
4970 }
4971 return true;
4972 };
4973 if (!collectShuffles(Op0) || !collectShuffles(Op1))
4974 return false;
4975 // From a reduction, we need to be processing a single shuffle, otherwise the
4976 // other uses will not be lane-invariant.
4977 if (FromReduction && Shuffles.size() > 1)
4978 return false;
4979
4980 // Add any shuffle uses for the shuffles we have found, to include them in our
4981 // cost calculations.
4982 if (!FromReduction) {
4983 for (ShuffleVectorInst *SV : Shuffles) {
4984 for (auto *U : SV->users()) {
4985 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
4986 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
4987 Shuffles.push_back(SSV);
4988 }
4989 }
4990 }
4991
4992 // For each of the output shuffles, we try to sort all the first vector
4993 // elements to the beginning, followed by the second vector's elements at the
4994 // end. If the binops are legalized to smaller vectors, this may reduce the
4995 // total number of binops. We compute the ReconstructMask needed to convert
4996 // back to the original lane order.
4997 SmallVector<std::pair<int, int>> V1, V2;
4998 SmallVector<SmallVector<int>> OrigReconstructMasks;
4999 int MaxV1Elt = 0, MaxV2Elt = 0;
5000 unsigned NumElts = VT->getNumElements();
5001 for (ShuffleVectorInst *SVN : Shuffles) {
5002 SmallVector<int> Mask;
5003 SVN->getShuffleMask(Mask);
5004
5005 // Check the operands are the same as the original, or reversed (in which
5006 // case we need to commute the mask).
5007 Value *SVOp0 = SVN->getOperand(0);
5008 Value *SVOp1 = SVN->getOperand(1);
5009 if (isa<UndefValue>(SVOp1)) {
5010 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
5011 SVOp0 = SSV->getOperand(0);
5012 SVOp1 = SSV->getOperand(1);
5013 for (int &Elem : Mask) {
5014 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
5015 return false;
5016 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
5017 }
5018 }
5019 if (SVOp0 == Op1 && SVOp1 == Op0) {
5020 std::swap(SVOp0, SVOp1);
5021 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
5022 }
5023 if (SVOp0 != Op0 || SVOp1 != Op1)
5024 return false;
5025
5026 // Calculate the reconstruction mask for this shuffle, as the mask needed to
5027 // take the packed values from Op0/Op1 and reconstruct the original
5028 // order.
5029 SmallVector<int> ReconstructMask;
5030 for (unsigned I = 0; I < Mask.size(); I++) {
5031 if (Mask[I] < 0) {
5032 ReconstructMask.push_back(-1);
5033 } else if (Mask[I] < static_cast<int>(NumElts)) {
5034 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
5035 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
5036 return Mask[I] == A.first;
5037 });
5038 if (It != V1.end())
5039 ReconstructMask.push_back(It - V1.begin());
5040 else {
5041 ReconstructMask.push_back(V1.size());
5042 V1.emplace_back(Mask[I], V1.size());
5043 }
5044 } else {
5045 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
5046 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
5047 return Mask[I] - static_cast<int>(NumElts) == A.first;
5048 });
5049 if (It != V2.end())
5050 ReconstructMask.push_back(NumElts + It - V2.begin());
5051 else {
5052 ReconstructMask.push_back(NumElts + V2.size());
5053 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
5054 }
5055 }
5056 }
5057
5058 // For reductions, we know that the output lane ordering doesn't alter the
5059 // result. An in-order mask can help simplify the shuffle away.
5060 if (FromReduction)
5061 sort(ReconstructMask);
5062 OrigReconstructMasks.push_back(std::move(ReconstructMask));
5063 }
5064
5065 // If the maximum elements used from V1 and V2 are not larger than the new
5066 // vectors, the vectors are already packed and performing the optimization
5067 // again will likely not help any further. This also prevents us from getting
5068 // stuck in a cycle in case the costs do not also rule it out.
5069 if (V1.empty() || V2.empty() ||
5070 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
5071 MaxV2Elt == static_cast<int>(V2.size()) - 1))
5072 return false;
5073
5074 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
5075 // shuffle of another shuffle, or not a shuffle (which is treated like an
5076 // identity shuffle).
5077 auto GetBaseMaskValue = [&](Instruction *I, int M) {
5078 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5079 if (!SV)
5080 return M;
5081 if (isa<UndefValue>(SV->getOperand(1)))
5082 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
5083 if (InputShuffles.contains(SSV))
5084 return SSV->getMaskValue(SV->getMaskValue(M));
5085 return SV->getMaskValue(M);
5086 };
5087
5088 // Attempt to sort the inputs by ascending mask values to make simpler input
5089 // shuffles and push complex shuffles down to the uses. We sort on the first
5090 // of the two input shuffle orders, to try and get at least one input into a
5091 // nice order.
5092 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
5093 std::pair<int, int> Y) {
5094 int MXA = GetBaseMaskValue(A, X.first);
5095 int MYA = GetBaseMaskValue(A, Y.first);
5096 return MXA < MYA;
5097 };
5098 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
5099 return SortBase(SVI0A, A, B);
5100 });
5101 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
5102 return SortBase(SVI1A, A, B);
5103 });
5104 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
5105 // modified order of the input shuffles.
5106 SmallVector<SmallVector<int>> ReconstructMasks;
5107 for (const auto &Mask : OrigReconstructMasks) {
5108 SmallVector<int> ReconstructMask;
5109 for (int M : Mask) {
5110 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
5111 auto It = find_if(V, [M](auto A) { return A.second == M; });
5112 assert(It != V.end() && "Expected all entries in Mask");
5113 return std::distance(V.begin(), It);
5114 };
5115 if (M < 0)
5116 ReconstructMask.push_back(-1);
5117 else if (M < static_cast<int>(NumElts)) {
5118 ReconstructMask.push_back(FindIndex(V1, M));
5119 } else {
5120 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
5121 }
5122 }
5123 ReconstructMasks.push_back(std::move(ReconstructMask));
5124 }
5125
5126 // Calculate the masks needed for the new input shuffles, which get padded
5127 // with undef
5128 SmallVector<int> V1A, V1B, V2A, V2B;
5129 for (unsigned I = 0; I < V1.size(); I++) {
5130 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
5131 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
5132 }
5133 for (unsigned I = 0; I < V2.size(); I++) {
5134 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
5135 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
5136 }
5137 while (V1A.size() < NumElts) {
5138 V1A.push_back(PoisonMaskElem);
5139 V1B.push_back(PoisonMaskElem);
5140 }
5141 while (V2A.size() < NumElts) {
5142 V2A.push_back(PoisonMaskElem);
5143 V2B.push_back(PoisonMaskElem);
5144 }
5145
5146 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
5147 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5148 if (!SV)
5149 return C;
5150 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
5151 ? TTI::SK_PermuteSingleSrc
5152 : TTI::SK_PermuteTwoSrc,
5153 VT, VT, SV->getShuffleMask(), CostKind);
5154 };
5155 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
5156 return C +
5157 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
5158 };
5159
5160 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
5161 unsigned MaxVectorSize =
5162 TTI.getRegisterBitWidth(TTI::RGK_FixedWidthVector).getFixedValue();
5163 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
5164 if (MaxElementsInVector == 0)
5165 return false;
5166 // When there are multiple shufflevector operations on the same input,
5167 // especially when the vector length is larger than the register size,
5168 // identical shuffle patterns may occur across different groups of elements.
5169 // To avoid overestimating the cost by counting these repeated shuffles more
5170 // than once, we only account for unique shuffle patterns. This adjustment
5171 // prevents inflated costs in the cost model for wide vectors split into
5172 // several register-sized groups.
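// E.g. assuming 128-bit registers, a <16 x i32> mask that splits into four
// <4 x i32> groups all using the same per-group pattern is charged for only
// one of the four groups (the full shuffle cost scaled by 1/4).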
5173 std::set<SmallVector<int, 4>> UniqueShuffles;
5174 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
5175 // Compute the cost for performing the shuffle over the full vector.
5176 auto ShuffleCost =
5177 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
5178 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
5179 if (NumFullVectors < 2)
5180 return C + ShuffleCost;
5181 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
5182 unsigned NumUniqueGroups = 0;
5183 unsigned NumGroups = Mask.size() / MaxElementsInVector;
5184 // For each group of MaxElementsInVector contiguous elements,
5185 // collect their shuffle pattern and insert into the set of unique patterns.
5186 for (unsigned I = 0; I < NumFullVectors; ++I) {
5187 for (unsigned J = 0; J < MaxElementsInVector; ++J)
5188 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
5189 if (UniqueShuffles.insert(SubShuffle).second)
5190 NumUniqueGroups += 1;
5191 }
5192 return C + ShuffleCost * NumUniqueGroups / NumGroups;
5193 };
5194 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
5195 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5196 if (!SV)
5197 return C;
5198 SmallVector<int, 16> Mask;
5199 SV->getShuffleMask(Mask);
5200 return AddShuffleMaskAdjustedCost(C, Mask);
5201 };
5202 // Check that the input shuffles are all applied to the same pair of operands.
5203 auto AllShufflesHaveSameOperands =
5204 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
5205 if (InputShuffles.size() < 2)
5206 return false;
5207 ShuffleVectorInst *FirstSV =
5208 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
5209 if (!FirstSV)
5210 return false;
5211
5212 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
5213 return std::all_of(
5214 std::next(InputShuffles.begin()), InputShuffles.end(),
5215 [&](Instruction *I) {
5216 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
5217 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
5218 });
5219 };
5220
5221 // Get the costs of the shuffles + binops before and after with the new
5222 // shuffle masks.
5223 InstructionCost CostBefore =
5224 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
5225 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
5226 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
5227 InstructionCost(0), AddShuffleCost);
5228 if (AllShufflesHaveSameOperands(InputShuffles)) {
5229 UniqueShuffles.clear();
5230 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
5231 InstructionCost(0), AddShuffleAdjustedCost);
5232 } else {
5233 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
5234 InstructionCost(0), AddShuffleCost);
5235 }
5236
5237 // The new binops will be unused for lanes past the used shuffle lengths.
5238 // These types attempt to get the correct cost for that from the target.
5239 FixedVectorType *Op0SmallVT =
5240 FixedVectorType::get(VT->getScalarType(), V1.size());
5241 FixedVectorType *Op1SmallVT =
5242 FixedVectorType::get(VT->getScalarType(), V2.size());
5243 InstructionCost CostAfter =
5244 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
5245 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
5246 UniqueShuffles.clear();
5247 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
5248 InstructionCost(0), AddShuffleMaskAdjustedCost);
5249 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
5250 CostAfter +=
5251 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
5252 InstructionCost(0), AddShuffleMaskCost);
5253
5254 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
5255 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
5256 << " vs CostAfter: " << CostAfter << "\n");
5257 if (CostBefore < CostAfter ||
5258 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
5259 return false;
5260
5261 // The cost model has passed, create the new instructions.
5262 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
5263 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5264 if (!SV)
5265 return I;
5266 if (isa<UndefValue>(SV->getOperand(1)))
5267 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
5268 if (InputShuffles.contains(SSV))
5269 return SSV->getOperand(Op);
5270 return SV->getOperand(Op);
5271 };
5272 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
5273 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
5274 GetShuffleOperand(SVI0A, 1), V1A);
5275 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
5276 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
5277 GetShuffleOperand(SVI0B, 1), V1B);
5278 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
5279 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
5280 GetShuffleOperand(SVI1A, 1), V2A);
5281 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
5282 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
5283 GetShuffleOperand(SVI1B, 1), V2B);
5284 Builder.SetInsertPoint(Op0);
5285 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
5286 NSV0A, NSV0B);
5287 if (auto *I = dyn_cast<Instruction>(NOp0))
5288 I->copyIRFlags(Op0, true);
5289 Builder.SetInsertPoint(Op1);
5290 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
5291 NSV1A, NSV1B);
5292 if (auto *I = dyn_cast<Instruction>(NOp1))
5293 I->copyIRFlags(Op1, true);
5294
5295 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
5296 Builder.SetInsertPoint(Shuffles[S]);
5297 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
5298 replaceValue(*Shuffles[S], *NSV, false);
5299 }
5300
5301 Worklist.pushValue(NSV0A);
5302 Worklist.pushValue(NSV0B);
5303 Worklist.pushValue(NSV1A);
5304 Worklist.pushValue(NSV1B);
5305 return true;
5306}
5307
5308/// Check if instruction depends on ZExt and this ZExt can be moved after the
5309/// instruction. Move ZExt if it is profitable. For example:
5310/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
5311 /// lshr(zext(x),y) -> zext(lshr(x,trunc(y)))
5312 /// The cost model takes into account whether zext(x) has other users and
5313 /// whether the zext can be propagated through them too.
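///
/// For example (illustrative types), when the result is known to fit in the
/// narrow type:
///   %z = zext <8 x i8> %x to <8 x i32>
///   %r = and <8 x i32> %z, %y
/// may become
///   %t = trunc <8 x i32> %y to <8 x i8>
///   %a = and <8 x i8> %x, %t
///   %r = zext <8 x i8> %a to <8 x i32>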
5314bool VectorCombine::shrinkType(Instruction &I) {
5315 Value *ZExted, *OtherOperand;
5316 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
5317 m_Value(OtherOperand))) &&
5318 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
5319 return false;
5320
5321 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
5322
5323 auto *BigTy = cast<FixedVectorType>(I.getType());
5324 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
5325 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
5326
5327 if (I.getOpcode() == Instruction::LShr) {
5328 // Check that the shift amount is less than the number of bits in the
5329 // smaller type. Otherwise, the smaller lshr will return a poison value.
5330 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
5331 if (ShAmtKB.getMaxValue().uge(BW))
5332 return false;
5333 } else {
5334 // Check that the expression overall uses at most the same number of bits as
5335 // ZExted
5336 KnownBits KB = computeKnownBits(&I, *DL);
5337 if (KB.countMaxActiveBits() > BW)
5338 return false;
5339 }
5340
5341 // Calculate costs of leaving current IR as it is and moving ZExt operation
5342 // later, along with adding truncates if needed
5343 InstructionCost ZExtCost = TTI.getCastInstrCost(
5344 Instruction::ZExt, BigTy, SmallTy,
5345 TargetTransformInfo::CastContextHint::None, CostKind);
5346 InstructionCost CurrentCost = ZExtCost;
5347 InstructionCost ShrinkCost = 0;
5348
5349 // Calculate total cost and check that we can propagate through all ZExt users
5350 for (User *U : ZExtOperand->users()) {
5351 auto *UI = cast<Instruction>(U);
5352 if (UI == &I) {
5353 CurrentCost +=
5354 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
5355 ShrinkCost +=
5356 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
5357 ShrinkCost += ZExtCost;
5358 continue;
5359 }
5360
5361 if (!Instruction::isBinaryOp(UI->getOpcode()))
5362 return false;
5363
5364 // Check if we can propagate ZExt through its other users
5365 KnownBits KB = computeKnownBits(UI, *DL);
5366 if (KB.countMaxActiveBits() > BW)
5367 return false;
5368
5369 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
5370 ShrinkCost +=
5371 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
5372 ShrinkCost += ZExtCost;
5373 }
5374
5375 // If the other instruction operand is not a constant, we'll need to
5376 // generate a truncate instruction. So we have to adjust cost
5377 if (!isa<Constant>(OtherOperand))
5378 ShrinkCost += TTI.getCastInstrCost(
5379 Instruction::Trunc, SmallTy, BigTy,
5380 TargetTransformInfo::CastContextHint::None, CostKind);
5381
5382 // If the cost of shrinking types and leaving the IR is the same, we'll lean
5383 // towards modifying the IR because shrinking opens opportunities for other
5384 // shrinking optimisations.
5385 if (ShrinkCost > CurrentCost)
5386 return false;
5387
5388 Builder.SetInsertPoint(&I);
5389 Value *Op0 = ZExted;
5390 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
5391 // Keep the order of operands the same
5392 if (I.getOperand(0) == OtherOperand)
5393 std::swap(Op0, Op1);
5394 Value *NewBinOp =
5395 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
5396 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
5397 cast<Instruction>(NewBinOp)->copyMetadata(I);
5398 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
5399 replaceValue(I, *NewZExtr);
5400 return true;
5401}
5402
5403/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
5404/// shuffle (DstVec, SrcVec, Mask)
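///
/// For example, with two <4 x i32> vectors, ExtIdx = 2 and InsIdx = 0:
///   %e = extractelement <4 x i32> %src, i64 2
///   %r = insertelement <4 x i32> %dst, i32 %e, i64 0
/// may become
///   %r = shufflevector <4 x i32> %dst, <4 x i32> %src,
///                      <4 x i32> <i32 6, i32 1, i32 2, i32 3>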
5405bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
5406 Value *DstVec, *SrcVec;
5407 uint64_t ExtIdx, InsIdx;
5408 if (!match(&I,
5409 m_InsertElt(m_Value(DstVec),
5410 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
5411 m_ConstantInt(InsIdx))))
5412 return false;
5413
5414 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
5415 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
5416 // We can try combining vectors with different numbers of elements.
5417 if (!DstVecTy || !SrcVecTy ||
5418 SrcVecTy->getElementType() != DstVecTy->getElementType())
5419 return false;
5420
5421 unsigned NumDstElts = DstVecTy->getNumElements();
5422 unsigned NumSrcElts = SrcVecTy->getNumElements();
5423 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
5424 return false;
5425
5426 // Insertion into poison is a cheaper single operand shuffle.
5427 TargetTransformInfo::ShuffleKind SK;
5428 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
5429
5430 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
5431 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
5432 if (NeedDstSrcSwap) {
5433 SK = TargetTransformInfo::SK_PermuteSingleSrc;
5434 Mask[InsIdx] = ExtIdx % NumDstElts;
5435 std::swap(DstVec, SrcVec);
5436 } else {
5437 SK = TargetTransformInfo::SK_PermuteTwoSrc;
5438 std::iota(Mask.begin(), Mask.end(), 0);
5439 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
5440 }
5441
5442 // Cost
5443 auto *Ins = cast<InsertElementInst>(&I);
5444 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
5445 InstructionCost InsCost =
5446 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
5447 InstructionCost ExtCost =
5448 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
5449 InstructionCost OldCost = ExtCost + InsCost;
5450
5451 InstructionCost NewCost = 0;
5452 SmallVector<int> ExtToVecMask;
5453 if (!NeedExpOrNarrow) {
5454 // Ignore 'free' identity insertion shuffle.
5455 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
5456 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
5457 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
5458 nullptr, {DstVec, SrcVec});
5459 } else {
5460 // When creating a length-changing vector, always try to keep the relevant
5461 // element in an equivalent position, so that bulk shuffles are more likely
5462 // to be useful.
5463 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
5464 ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
5465 // Add cost for expanding or narrowing
5466 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
5467 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
5468 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
5469 }
5470
5471 if (!Ext->hasOneUse())
5472 NewCost += ExtCost;
5473
5474 LLVM_DEBUG(dbgs() << "Found an insert/extract shuffle-like pair: " << I
5475 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
5476 << "\n");
5477
5478 if (OldCost < NewCost)
5479 return false;
5480
5481 if (NeedExpOrNarrow) {
5482 if (!NeedDstSrcSwap)
5483 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
5484 else
5485 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
5486 }
5487
5488 // Canonicalize undef param to RHS to help further folds.
5489 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
5490 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
5491 std::swap(DstVec, SrcVec);
5492 }
5493
5494 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
5495 replaceValue(I, *Shuf);
5496
5497 return true;
5498}
5499
5500/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
5501/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
5502/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
5503/// before casting it back into `<vscale x 16 x i32>`.
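/// For the splats above, the merged i64 splat value is
/// (777 << 32) | 666 = 0x000003090000029A.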
5504bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
5505 const APInt *SplatVal0, *SplatVal1;
5506 if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
5507 m_APInt(SplatVal0), m_APInt(SplatVal1))))
5508 return false;
5509
5510 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
5511 << "\n");
5512
5513 auto *VTy =
5514 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
5515 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
5516 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
5517
5518 // We use <= rather than < here so that we bail out when the costs of the
5519 // interleave2 intrinsic and the bitcast are both invalid. Even if they both
5520 // have valid and equal costs, it's probably not a good idea to emit a
5521 // high-cost constant splat.
5522 if (TTI.getInstructionCost(&I, CostKind) <=
5523 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
5524 TTI::CastContextHint::None, CostKind)) {
5525 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
5526 << *I.getType() << " is too high.\n");
5527 return false;
5528 }
5529
5530 APInt NewSplatVal = SplatVal1->zext(Width * 2);
5531 NewSplatVal <<= Width;
5532 NewSplatVal |= SplatVal0->zext(Width * 2);
5533 auto *NewSplat = ConstantVector::getSplat(
5534 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
5535
5536 IRBuilder<> Builder(&I);
5537 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
5538 return true;
5539}
5540
5541// Attempt to shrink loads that are only used by shufflevector instructions.
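// For example, a load of <8 x i32> whose only users are shuffles that
// reference elements 0..3 may be shrunk to a <4 x i32> load, subject to the
// cost checks below.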
5542bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
5543 auto *OldLoad = dyn_cast<LoadInst>(&I);
5544 if (!OldLoad || !OldLoad->isSimple())
5545 return false;
5546
5547 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
5548 if (!OldLoadTy)
5549 return false;
5550
5551 unsigned const OldNumElements = OldLoadTy->getNumElements();
5552
5553 // Search all uses of load. If all uses are shufflevector instructions, and
5554 // the second operands are all poison values, find the minimum and maximum
5555 // indices of the vector elements referenced by all shuffle masks.
5556 // Otherwise return `std::nullopt`.
5557 using IndexRange = std::pair<int, int>;
5558 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
5559 IndexRange OutputRange = IndexRange(OldNumElements, -1);
5560 for (llvm::Use &Use : I.uses()) {
5561 // Ensure all uses match the required pattern.
5562 User *Shuffle = Use.getUser();
5563 ArrayRef<int> Mask;
5564
5565 if (!match(Shuffle,
5566 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
5567 return std::nullopt;
5568
5569 // Ignore shufflevector instructions that have no uses.
5570 if (Shuffle->use_empty())
5571 continue;
5572
5573 // Find the min and max indices used by the shufflevector instruction.
5574 for (int Index : Mask) {
5575 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
5576 OutputRange.first = std::min(Index, OutputRange.first);
5577 OutputRange.second = std::max(Index, OutputRange.second);
5578 }
5579 }
5580 }
5581
5582 if (OutputRange.second < OutputRange.first)
5583 return std::nullopt;
5584
5585 return OutputRange;
5586 };
5587
5588 // Get the range of vector elements used by shufflevector instructions.
5589 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
5590 unsigned const NewNumElements = Indices->second + 1u;
5591
5592 // If the range of vector elements is smaller than the full load, attempt
5593 // to create a smaller load.
5594 if (NewNumElements < OldNumElements) {
5595 IRBuilder Builder(&I);
5596 Builder.SetCurrentDebugLocation(I.getDebugLoc());
5597
5598 // Calculate costs of old and new ops.
5599 Type *ElemTy = OldLoadTy->getElementType();
5600 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
5601 Value *PtrOp = OldLoad->getPointerOperand();
5602
5603 InstructionCost OldCost = TTI.getMemoryOpCost(
5604 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
5605 OldLoad->getPointerAddressSpace(), CostKind);
5606 InstructionCost NewCost =
5607 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
5608 OldLoad->getPointerAddressSpace(), CostKind);
5609
5610 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
5611 SmallVector<UseEntry, 4u> NewUses;
5612 unsigned const MaxIndex = NewNumElements * 2u;
5613
5614 for (llvm::Use &Use : I.uses()) {
5615 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
5616
5617 // Ignore shufflevector instructions that have no uses.
5618 if (Shuffle->use_empty())
5619 continue;
5620
5621 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
5622
5623 // Create entry for new use.
5624 NewUses.push_back({Shuffle, OldMask});
5625
5626 // Validate mask indices.
5627 for (int Index : OldMask) {
5628 if (Index >= static_cast<int>(MaxIndex))
5629 return false;
5630 }
5631
5632 // Update costs.
5633 OldCost +=
5634 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
5635 OldLoadTy, OldMask, CostKind);
5636 NewCost +=
5637 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
5638 NewLoadTy, OldMask, CostKind);
5639 }
5640
5641 LLVM_DEBUG(
5642 dbgs() << "Found a load used only by shufflevector instructions: "
5643 << I << "\n OldCost: " << OldCost
5644 << " vs NewCost: " << NewCost << "\n");
5645
5646 if (OldCost < NewCost || !NewCost.isValid())
5647 return false;
5648
5649 // Create new load of smaller vector.
5650 auto *NewLoad = cast<LoadInst>(
5651 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
5652 NewLoad->copyMetadata(I);
5653
5654 // Replace all uses.
5655 for (UseEntry &Use : NewUses) {
5656 ShuffleVectorInst *Shuffle = Use.first;
5657 std::vector<int> &NewMask = Use.second;
5658
5659 Builder.SetInsertPoint(Shuffle);
5660 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
5661 Value *NewShuffle = Builder.CreateShuffleVector(
5662 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
5663
5664 replaceValue(*Shuffle, *NewShuffle, false);
5665 }
5666
5667 return true;
5668 }
5669 }
5670 return false;
5671}
5672
5673// Attempt to narrow a phi of shufflevector instructions where the two incoming
5674// values have the same operands but different masks. If the two shuffle masks
5675// are offsets of one another we can use one branch to rotate the incoming
5676// vector and perform one larger shuffle after the phi.
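// For example (hypothetical masks), with a <4 x i32> input %v widened to
// <8 x i32> by both incoming shuffles using
//   mask0 = <2, 3, 2, 3, 2, 3, 2, 3> and mask1 = <0, 1, 0, 1, 0, 1, 0, 1>,
// the per-lane difference is a constant 2, so one incoming branch rotates %v
// with <2, 3, 0, 1>, the phi is narrowed to <4 x i32>, and a single shuffle
// with mask1 after the phi reproduces both original results.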
5677bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
5678 auto *Phi = dyn_cast<PHINode>(&I);
5679 if (!Phi || Phi->getNumIncomingValues() != 2u)
5680 return false;
5681
5682 Value *Op = nullptr;
5683 ArrayRef<int> Mask0;
5684 ArrayRef<int> Mask1;
5685
5686 if (!match(Phi->getOperand(0u),
5687 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
5688 !match(Phi->getOperand(1u),
5689 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
5690 return false;
5691
5692 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
5693
5694 // Ensure result vectors are wider than the argument vector.
5695 auto *InputVT = cast<FixedVectorType>(Op->getType());
5696 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
5697 auto const InputNumElements = InputVT->getNumElements();
5698
5699 if (InputNumElements >= ResultVT->getNumElements())
5700 return false;
5701
5702 // Take the difference of the two shuffle masks at each index. Ignore poison
5703 // values at the same index in both masks.
5704 SmallVector<int, 16> NewMask;
5705 NewMask.reserve(Mask0.size());
5706
5707 for (auto [M0, M1] : zip(Mask0, Mask1)) {
5708 if (M0 >= 0 && M1 >= 0)
5709 NewMask.push_back(M0 - M1);
5710 else if (M0 == -1 && M1 == -1)
5711 continue;
5712 else
5713 return false;
5714 }
5715
5716 // Ensure all elements of the new mask are equal. If the difference between
5717 // the incoming mask elements is the same, the two must be constant offsets
5718 // of one another.
5719 if (NewMask.empty() || !all_equal(NewMask))
5720 return false;
5721
5722 // Create new mask using difference of the two incoming masks.
5723 int MaskOffset = NewMask[0u];
5724 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
5725 NewMask.clear();
5726
5727 for (unsigned I = 0u; I < InputNumElements; ++I) {
5728 NewMask.push_back(Index);
5729 Index = (Index + 1u) % InputNumElements;
5730 }
5731
5732 // Calculate costs for worst cases and compare.
5733 auto const Kind = TTI::SK_PermuteSingleSrc;
5734 auto OldCost =
5735 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
5736 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
5737 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
5738 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
5739
5740 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
5741 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
5742 << "\n");
5743
5744 if (NewCost > OldCost)
5745 return false;
5746
5747 // Create new shuffles and narrowed phi.
5748 auto Builder = IRBuilder(Shuf);
5749 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
5750 auto *PoisonVal = PoisonValue::get(InputVT);
5751 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
5752 Worklist.push(cast<Instruction>(NewShuf0));
5753
5754 Builder.SetInsertPoint(Phi);
5755 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
5756 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
5757 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
5758 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
5759
5760 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
5761 PoisonVal = PoisonValue::get(NewPhi->getType());
5762 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
5763
5764 replaceValue(*Phi, *NewShuf1);
5765 return true;
5766}
5767
5768/// This is the entry point for all transforms. Pass manager differences are
5769/// handled in the callers of this function.
5770bool VectorCombine::run() {
5771 if (DisableVectorCombine)
5772 return false;
5773
5774 // Don't attempt vectorization if the target does not support vectors.
5775 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
5776 return false;
5777
5778 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
5779
5780 auto FoldInst = [this](Instruction &I) {
5781 Builder.SetInsertPoint(&I);
5782 bool IsVectorType = isa<VectorType>(I.getType());
5783 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
5784 auto Opcode = I.getOpcode();
5785
5786 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
5787
5788 // These folds should be beneficial regardless of when this pass is run
5789 // in the optimization pipeline.
5790 // The type checking is for run-time efficiency. We can avoid wasting time
5791 // dispatching to folding functions if there's no chance of matching.
5792 if (IsFixedVectorType) {
5793 switch (Opcode) {
5794 case Instruction::InsertElement:
5795 if (vectorizeLoadInsert(I))
5796 return true;
5797 break;
5798 case Instruction::ShuffleVector:
5799 if (widenSubvectorLoad(I))
5800 return true;
5801 break;
5802 default:
5803 break;
5804 }
5805 }
5806
5807 // These transforms work with scalable and fixed vectors
5808 // TODO: Identify and allow other scalable transforms
5809 if (IsVectorType) {
5810 if (scalarizeOpOrCmp(I))
5811 return true;
5812 if (scalarizeLoad(I))
5813 return true;
5814 if (scalarizeExtExtract(I))
5815 return true;
5816 if (scalarizeVPIntrinsic(I))
5817 return true;
5818 if (foldInterleaveIntrinsics(I))
5819 return true;
5820 }
5821
5822 if (Opcode == Instruction::Store)
5823 if (foldSingleElementStore(I))
5824 return true;
5825
5826 // If this is an early pipeline invocation of this pass, we are done.
5827 if (TryEarlyFoldsOnly)
5828 return false;
5829
5830 // Otherwise, try folds that improve codegen but may interfere with
5831 // early IR canonicalizations.
5832 // The type checking is for run-time efficiency. We can avoid wasting time
5833 // dispatching to folding functions if there's no chance of matching.
5834 if (IsFixedVectorType) {
5835 switch (Opcode) {
5836 case Instruction::InsertElement:
5837 if (foldInsExtFNeg(I))
5838 return true;
5839 if (foldInsExtBinop(I))
5840 return true;
5841 if (foldInsExtVectorToShuffle(I))
5842 return true;
5843 break;
5844 case Instruction::ShuffleVector:
5845 if (foldPermuteOfBinops(I))
5846 return true;
5847 if (foldShuffleOfBinops(I))
5848 return true;
5849 if (foldShuffleOfSelects(I))
5850 return true;
5851 if (foldShuffleOfCastops(I))
5852 return true;
5853 if (foldShuffleOfShuffles(I))
5854 return true;
5855 if (foldPermuteOfIntrinsic(I))
5856 return true;
5857 if (foldShufflesOfLengthChangingShuffles(I))
5858 return true;
5859 if (foldShuffleOfIntrinsics(I))
5860 return true;
5861 if (foldSelectShuffle(I))
5862 return true;
5863 if (foldShuffleToIdentity(I))
5864 return true;
5865 break;
5866 case Instruction::Load:
5867 if (shrinkLoadForShuffles(I))
5868 return true;
5869 break;
5870 case Instruction::BitCast:
5871 if (foldBitcastShuffle(I))
5872 return true;
5873 if (foldSelectsFromBitcast(I))
5874 return true;
5875 break;
5876 case Instruction::And:
5877 case Instruction::Or:
5878 case Instruction::Xor:
5879 if (foldBitOpOfCastops(I))
5880 return true;
5881 if (foldBitOpOfCastConstant(I))
5882 return true;
5883 break;
5884 case Instruction::PHI:
5885 if (shrinkPhiOfShuffles(I))
5886 return true;
5887 break;
5888 default:
5889 if (shrinkType(I))
5890 return true;
5891 break;
5892 }
5893 } else {
5894 switch (Opcode) {
5895 case Instruction::Call:
5896 if (foldShuffleFromReductions(I))
5897 return true;
5898 if (foldCastFromReductions(I))
5899 return true;
5900 break;
5901 case Instruction::ExtractElement:
5902 if (foldShuffleChainsToReduce(I))
5903 return true;
5904 break;
5905 case Instruction::ICmp:
5906 if (foldSignBitReductionCmp(I))
5907 return true;
5908 if (foldICmpEqZeroVectorReduce(I))
5909 return true;
5910 if (foldEquivalentReductionCmp(I))
5911 return true;
5912 [[fallthrough]];
5913 case Instruction::FCmp:
5914 if (foldExtractExtract(I))
5915 return true;
5916 break;
5917 case Instruction::Or:
5918 if (foldConcatOfBoolMasks(I))
5919 return true;
5920 [[fallthrough]];
5921 default:
5922 if (Instruction::isBinaryOp(Opcode)) {
5923 if (foldExtractExtract(I))
5924 return true;
5925 if (foldExtractedCmps(I))
5926 return true;
5927 if (foldBinopOfReductions(I))
5928 return true;
5929 }
5930 break;
5931 }
5932 }
5933 return false;
5934 };
5935
5936 bool MadeChange = false;
5937 for (BasicBlock &BB : F) {
5938 // Ignore unreachable basic blocks.
5939 if (!DT.isReachableFromEntry(&BB))
5940 continue;
5941 // We need to erase instructions while iterating over the block, but
5942 // make_early_inc_range is not applicable here, as the next iterator may
5943 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
5944 // We manually maintain the next instruction and update it when it is about
5945 // to be deleted.
5946 Instruction *I = &BB.front();
5947 while (I) {
5948 NextInst = I->getNextNode();
5949 if (!I->isDebugOrPseudoInst())
5950 MadeChange |= FoldInst(*I);
5951 I = NextInst;
5952 }
5953 }
5954
5955 NextInst = nullptr;
5956
5957 while (!Worklist.isEmpty()) {
5958 Instruction *I = Worklist.removeOne();
5959 if (!I)
5960 continue;
5961
5962 if (isInstructionTriviallyDead(I)) {
5963 eraseInstruction(*I);
5964 continue;
5965 }
5966
5967 MadeChange |= FoldInst(*I);
5968 }
5969
5970 return MadeChange;
5971}
5972
5973 PreservedAnalyses VectorCombinePass::run(Function &F,
5974 FunctionAnalysisManager &FAM) {
5975 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
5976 TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
5977 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
5978 AAResults &AA = FAM.getResult<AAManager>(F);
5979 const DataLayout *DL = &F.getDataLayout();
5980 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
5981 TryEarlyFoldsOnly);
5982 if (!Combiner.run())
5983 return PreservedAnalyses::all();
5984 PreservedAnalyses PA;
5985 PA.preserveSet<CFGAnalyses>();
5986 return PA;
5987}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< unsigned > MaxInstrsToScan("aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden, cl::desc("Max number of instructions to scan for aggressive instcombine."))
This is the interface for LLVM's primary stateless and local alias analysis.
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
This file defines the DenseMap class.
#define Check(C,...)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition LICM.cpp:1448
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T1
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
FunctionAnalysisManager FAM
const SmallVectorImpl< MachineOperand > & Cond
unsigned OpIndex
This file contains some templates that are useful if you are working with the STL at all.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static bool isFreeConcat(ArrayRef< InstLane > Item, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI)
Detect concat of multiple values into a vector.
static void analyzeCostOfVecReduction(const IntrinsicInst &II, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI, InstructionCost &CostBeforeReduction, InstructionCost &CostAfterReduction)
static SmallVector< InstLane > generateInstLaneVectorFromOperand(ArrayRef< InstLane > Item, int Op)
static Value * createShiftShuffle(Value *Vec, unsigned OldIndex, unsigned NewIndex, IRBuilderBase &Builder)
Create a shuffle that translates (shifts) 1 element from the input vector to a new element location.
static Value * generateNewInstTree(ArrayRef< InstLane > Item, Use *From, FixedVectorType *Ty, const DenseSet< std::pair< Value *, Use * > > &IdentityLeafs, const DenseSet< std::pair< Value *, Use * > > &SplatLeafs, const DenseSet< std::pair< Value *, Use * > > &ConcatLeafs, IRBuilderBase &Builder, const TargetTransformInfo *TTI)
std::pair< Value *, int > InstLane
static Align computeAlignmentAfterScalarization(Align VectorAlignment, Type *ScalarType, Value *Idx, const DataLayout &DL)
The memory operation on a vector of ScalarType had alignment of VectorAlignment.
static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI)
Returns true if this ShuffleVectorInst eventually feeds into a vector reduction intrinsic (e....
static cl::opt< bool > DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden, cl::desc("Disable all vector combine transforms"))
static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI)
static const unsigned InvalidIndex
static Value * translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, IRBuilderBase &Builder)
Given an extract element instruction with constant index operand, shuffle the source vector (shift th...
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, const SimplifyQuery &SQ)
Check if it is legal to scalarize a memory access to VecTy at index Idx.
static cl::opt< unsigned > MaxInstrsToScan("vector-combine-max-scan-instrs", cl::init(30), cl::Hidden, cl::desc("Max number of instructions to scan for vector combining."))
static cl::opt< bool > DisableBinopExtractShuffle("disable-binop-extract-shuffle", cl::init(false), cl::Hidden, cl::desc("Disable binop extract to shuffle transforms"))
static InstLane lookThroughShuffles(Value *V, int Lane)
static bool isMemModifiedBetween(BasicBlock::iterator Begin, BasicBlock::iterator End, const MemoryLocation &Loc, AAResults &AA)
static constexpr int Concat[]
Value * RHS
Value * LHS
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1054
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1638
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Return true if the attribute exists in this set.
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
BinaryOps getOpcode() const
Definition InstrTypes.h:374
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:986
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
bool isFPPredicate() const
Definition InstrTypes.h:782
static LLVM_ABI std::optional< CmpPredicate > getMatching(CmpPredicate A, CmpPredicate B)
Compares two CmpPredicates taking samesign into account and returns the canonicalized CmpPredicate if...
Combiner implementation.
Definition Combiner.h:33
static LLVM_ABI Constant * getExtractElement(Constant *Vec, Constant *Idx, Type *OnlyIfReducedTy=nullptr)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
LLVM_ABI ConstantRange urem(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an unsigned remainder operation of...
LLVM_ABI ConstantRange binaryAnd(const ConstantRange &Other) const
Return a new range representing the possible values resulting from a binary-and of a value in this ra...
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
This instruction extracts a single (scalar) element from a VectorType value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
Predicate getSignedPredicate() const
For example, EQ->EQ, SLE->SLE, UGT->SGT, etc.
bool isEquality() const
Return true if this predicate is either EQ or NE.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2620
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2608
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1930
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted to NumElts elements.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2686
void SetCurrentDebugLocation(const DebugLoc &L)
Set location information used by debugging information.
Definition IRBuilder.h:247
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1553
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition IRBuilder.h:2269
Value * CreateIsNotNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg > -1.
Definition IRBuilder.h:2710
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:2011
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2294
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:534
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:529
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2501
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2532
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition IRBuilder.h:172
Value * CreateIsNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg < 0.
Definition IRBuilder.h:2705
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2235
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1913
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1532
LLVM_ABI Value * CreateNAryOp(unsigned Opc, ArrayRef< Value * > Ops, const Twine &Name="", MDNode *FPMathTag=nullptr)
Create either a UnaryOperator or BinaryOperator depending on Opc.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2113
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2642
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1591
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1926
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2099
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:629
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1748
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateFNegFMF(Value *V, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1851
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2477
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1613
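The IRBuilder members listed above are the entry points this pass uses to emit replacement IR. As a minimal sketch only (the helper name, operands, and choice of opcode are illustrative assumptions, not code from this file), a folder-backed builder can form a vector binop and pull out a single lane:

  #include "llvm/Analysis/InstSimplifyFolder.h"
  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Hypothetical helper: form one vector binop and extract a single lane.
  // The InstSimplifyFolder may fold the new instructions to existing values
  // instead of emitting anything.
  static Value *emitVectorBinOpLane(IRBuilder<InstSimplifyFolder> &Builder,
                                    Instruction::BinaryOps Opcode,
                                    Value *VecA, Value *VecB, uint64_t Lane) {
    Value *VecOp = Builder.CreateBinOp(Opcode, VecA, VecB, "vec.bo");
    return Builder.CreateExtractElement(VecOp, Builder.getInt64(Lane), "lane");
  }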
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void push(Instruction *I)
Push the instruction onto the worklist stack.
LLVM_ABI void setHasNoUnsignedWrap(bool b=true)
Set or clear the nuw flag on this instruction, which must be an operator which supports this flag.
LLVM_ABI void copyIRFlags(const Value *V, bool IncludeWrapFlags=true)
Convenience method to copy supported exact, fast-math, and (optionally) wrapping flags from V to this...
LLVM_ABI void setHasNoSignedWrap(bool b=true)
Set or clear the nsw flag on this instruction, which must be an operator which supports this flag.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void andIRFlags(const Value *V)
Logical 'and' of any supported wrapping, exact, and fast-math flags of V and this instruction.
bool isBinaryOp() const
LLVM_ABI void setNonNeg(bool b=true)
Set or clear the nneg flag on this instruction, which must be a zext instruction.
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
LLVM_ABI AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAlignment(Align Align)
Type * getPointerOperandType() const
Align getAlign() const
Return the alignment of the access that is being performed.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
const SDValue & getOperand(unsigned Num) const
This instruction constructs a fixed permutation of two input vectors.
int getMaskValue(unsigned Elt) const
Return the shuffle mask value of this instruction for the given element index.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static void commuteShuffleMask(MutableArrayRef< int > Mask, unsigned InVecNumElts)
Change values in a shuffle permute mask assuming the two vector operands of length InVecNumElts have ...
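The ShuffleVectorInst helpers above (getShuffleMask, isIdentityMask, commuteShuffleMask) are how shuffle masks are inspected and reoriented. A small illustrative helper, assuming the shuffle's first operand is a fixed-width vector (the function itself is hypothetical, not part of this file):

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Hypothetical helper: extract the mask and, when it is not already an
  // identity of a single source, commute it so the operands can be swapped.
  static bool getCommutedShuffleMask(ShuffleVectorInst *Shuf,
                                     SmallVectorImpl<int> &Mask) {
    Shuf->getShuffleMask(Mask);
    unsigned NumSrcElts =
        cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
    if (ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
      return false;                 // already forwards one source unchanged
    ShuffleVectorInst::commuteShuffleMask(Mask, NumSrcElts);
    return true;
  }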
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
void setAlignment(Align Align)
Analysis pass providing the TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ None
The insert/extract is not used with a load/store.
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
static LLVM_ABI OperandValueInfo commonOperandInfo(const Value *X, const Value *Y)
Collect common data between two OperandValueInfo inputs.
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const
Returns true if GEP should not be used to index into vectors for this target.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, const Value *Op0=nullptr, const Value *Op1=nullptr, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
LLVM_ABI unsigned getMinVectorRegisterBitWidth() const
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
Estimate the overhead of scalarizing an instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
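The TargetTransformInfo queries above drive every fold in this pass: a cost is computed for the existing pattern and compared against the cost of the rewritten form. A simplified sketch of that idiom (the opcode, types, and lane index are illustrative assumptions, not a transcription of any specific fold here):

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Hypothetical profitability check: two extracts feeding a scalar add versus
  // one vector add followed by a single extract.
  static bool vectorAddIsProfitable(const TargetTransformInfo &TTI,
                                    FixedVectorType *VecTy,
                                    TargetTransformInfo::TargetCostKind CostKind) {
    InstructionCost ExtractCost = TTI.getVectorInstrCost(
        Instruction::ExtractElement, VecTy, CostKind, /*Index=*/0);
    InstructionCost OldCost =
        ExtractCost + ExtractCost +
        TTI.getArithmeticInstrCost(Instruction::Add, VecTy->getScalarType(),
                                   CostKind);
    InstructionCost NewCost =
        TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind) +
        ExtractCost;
    return NewCost <= OldCost;      // never transform to something costlier
  }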
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:267
Value * getOperand(unsigned i) const
Definition User.h:207
static LLVM_ABI bool isVPBinOp(Intrinsic::ID ID)
std::optional< unsigned > getFunctionalIntrinsicID() const
std::optional< unsigned > getFunctionalOpcode() const
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition Value.h:737
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:162
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:964
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:146
bool use_empty() const
Definition Value.h:346
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
bool user_empty() const
Definition Value.h:389
PreservedAnalyses run(Function &F, FunctionAnalysisManager &)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type size() const
Definition DenseSet.h:87
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition APInt.h:2277
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition APInt.h:2282
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_and< Ty... > m_CombineAnd(const Ty &...Ps)
Combine pattern matchers matching all of Ps patterns.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::URem > m_URem(const LHS &L, const RHS &R)
auto m_Poison()
Match an arbitrary poison constant.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
match_bind< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
DisjointOr_match< LHS, RHS > m_DisjointOr(const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_right_shift_op > m_Shr(const LHS &L, const RHS &R)
Matches logical shift operations.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_Constant()
Match an arbitrary Constant and ignore it.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
cst_pred_ty< is_non_zero_int > m_NonZeroInt()
Match a non-zero integer or a vector with all non-zero elements.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Shl, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWShl(const LHS &L, const RHS &R)
auto m_AnyIntrinsic()
Matches any intrinsic call and ignore it.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Mul, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWMul(const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_bitwiselogic_op, true > m_c_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
match_combine_or< CastInst_match< OpTy, SExtInst >, NNegZExt_match< OpTy > > m_SExtLike(const OpTy &Op)
Match either "sext" or "zext nneg".
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
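The PatternMatch helpers above are how candidate instructions are recognized before any cost modeling happens. A small illustrative matcher, assuming a hypothetical pattern of a one-use binop feeding an extractelement at a constant lane (not one of this file's actual folds):

  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;

  // Hypothetical matcher: extractelement (binop %x, %y), <constant lane>.
  static bool matchExtractOfBinOp(Instruction &I, Value *&X, Value *&Y,
                                  uint64_t &Lane) {
    using namespace llvm::PatternMatch;
    return match(&I, m_ExtractElt(m_OneUse(m_BinOp(m_Value(X), m_Value(Y))),
                                  m_ConstantInt(Lane)));
  }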
@ Valid
The data is already valid.
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
@ User
could "use" a pointer
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:557
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:830
void stable_sort(R &&Range)
Definition STLExtras.h:2115
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:350
LLVM_ABI Value * simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q)
Given operand for a UnaryOperator, fold the result or return null.
scope_exit(Callable) -> scope_exit< Callable >
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
LLVM_ABI Value * simplifyCall(CallBase *Call, Value *Callee, ArrayRef< Value * > Args, const SimplifyQuery &Q)
Given a callsite, callee, and arguments, fold the result or return null.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
LLVM_ABI bool mustSuppressSpeculation(const LoadInst &LI)
Return true if speculation of the given load must be suppressed to avoid ordering or interfering with...
Definition Loads.cpp:431
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
unsigned M1(unsigned Val)
Definition VE.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:403
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool isModSet(const ModRefInfo MRI)
Definition ModRef.h:49
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI bool programUndefinedIfPoison(const Instruction *Inst)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:446
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ABI bool isKnownNonZero(const Value *V, const SimplifyQuery &Q, unsigned Depth=0)
Return true if the given value is known to be non-zero when defined.
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
LLVM_ABI bool isSafeToSpeculativelyExecuteWithOpcode(unsigned Opcode, const Instruction *Inst, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
This returns the same result as isSafeToSpeculativelyExecute if Opcode is the actual opcode of Inst.
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
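widenShuffleMaskElts and narrowShuffleMaskElts (listed above and earlier in this index) rescale a shuffle mask when the element size changes, for example across a bitcast of the shuffled value. A sketch under the assumption that one element width evenly divides the other (the helper itself is hypothetical):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/VectorUtils.h"
  using namespace llvm;

  // Hypothetical helper: translate OldMask, written for SrcEltBits-wide lanes,
  // into an equivalent mask for DstEltBits-wide lanes.
  static bool rescaleShuffleMask(ArrayRef<int> OldMask, unsigned SrcEltBits,
                                 unsigned DstEltBits,
                                 SmallVectorImpl<int> &NewMask) {
    if (DstEltBits <= SrcEltBits) {
      // Splitting lanes into narrower pieces always succeeds.
      narrowShuffleMaskElts(SrcEltBits / DstEltBits, OldMask, NewMask);
      return true;
    }
    // Merging lanes only works if each group maps onto one contiguous wide lane.
    return widenShuffleMaskElts(DstEltBits / SrcEltBits, OldMask, NewMask);
  }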
LLVM_ABI Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc)
Returns the reduction intrinsic id corresponding to the binary operation.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
LLVM_ABI Constant * getLosslessInvCast(Constant *C, Type *InvCastTo, unsigned CastOp, const DataLayout &DL, PreservedCastFlags *Flags=nullptr)
Try to cast C to InvC losslessly, satisfying CastOp(InvC) equals C, or CastOp(InvC) is a refined valu...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2165
LLVM_ABI Value * simplifyCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a CmpInst, fold the result or return null.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicID(Intrinsic::ID IID)
Returns the llvm.vector.reduce min/max intrinsic that corresponds to the intrinsic op.
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, const SimplifyQuery &SQ, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
LLVM_ABI AAMDNodes adjustForAccess(unsigned AccessSize)
Create a new AAMDNode for accessing AccessSize bytes of this AAMDNode.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:310
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:262
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:146
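The KnownBits queries above, combined with computeKnownBits listed earlier, are a typical way to prove that a variable lane index cannot reach out-of-bounds lanes. A minimal sketch, with the helper name and calling context assumed for illustration:

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Value.h"
  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  // Hypothetical helper: return true when every possible value of Idx is a
  // valid lane number for a vector with NumElts elements.
  static bool indexIsKnownInBounds(const Value *Idx, unsigned NumElts,
                                   const DataLayout &DL) {
    KnownBits Known(Idx->getType()->getScalarSizeInBits());
    computeKnownBits(Idx, Known, DL);
    return Known.getMaxValue().ult(NumElts);
  }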
const Instruction * CxtI
const DominatorTree * DT
SimplifyQuery getWithInstruction(const Instruction *I) const
AssumptionCache * AC