LLVM 23.0.0git
VectorCombine.cpp
Go to the documentation of this file.
1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
39#include <numeric>
40#include <optional>
41#include <queue>
42#include <set>
43
44#define DEBUG_TYPE "vector-combine"
46
47using namespace llvm;
48using namespace llvm::PatternMatch;
49
50STATISTIC(NumVecLoad, "Number of vector loads formed");
51STATISTIC(NumVecCmp, "Number of vector compares formed");
52STATISTIC(NumVecBO, "Number of vector binops formed");
53STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
54STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
55STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
56STATISTIC(NumScalarCmp, "Number of scalar compares formed");
57STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
58
60 "disable-vector-combine", cl::init(false), cl::Hidden,
61 cl::desc("Disable all vector combine transforms"));
62
64 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
65 cl::desc("Disable binop extract to shuffle transforms"));
66
68 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
69 cl::desc("Max number of instructions to scan for vector combining."));
70
71static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
72
73namespace {
74class VectorCombine {
75public:
76 VectorCombine(Function &F, const TargetTransformInfo &TTI,
79 bool TryEarlyFoldsOnly)
80 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
81 DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
82 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
83
84 bool run();
85
86private:
87 Function &F;
89 const TargetTransformInfo &TTI;
90 const DominatorTree &DT;
91 AAResults &AA;
92 AssumptionCache &AC;
93 const DataLayout *DL;
94 TTI::TargetCostKind CostKind;
95 const SimplifyQuery SQ;
96
97 /// If true, only perform beneficial early IR transforms. Do not introduce new
98 /// vector operations.
99 bool TryEarlyFoldsOnly;
100
101 InstructionWorklist Worklist;
102
103 /// Next instruction to iterate. It will be updated when it is erased by
104 /// RecursivelyDeleteTriviallyDeadInstructions.
105 Instruction *NextInst;
106
107 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
108 // parameter. That should be updated to specific sub-classes because the
109 // run loop was changed to dispatch on opcode.
110 bool vectorizeLoadInsert(Instruction &I);
111 bool widenSubvectorLoad(Instruction &I);
112 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
113 ExtractElementInst *Ext1,
114 unsigned PreferredExtractIndex) const;
115 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
116 const Instruction &I,
117 ExtractElementInst *&ConvertToShuffle,
118 unsigned PreferredExtractIndex);
119 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
121 bool foldExtractExtract(Instruction &I);
122 bool foldInsExtFNeg(Instruction &I);
123 bool foldInsExtBinop(Instruction &I);
124 bool foldInsExtVectorToShuffle(Instruction &I);
125 bool foldBitOpOfCastops(Instruction &I);
126 bool foldBitOpOfCastConstant(Instruction &I);
127 bool foldBitcastShuffle(Instruction &I);
128 bool scalarizeOpOrCmp(Instruction &I);
129 bool scalarizeVPIntrinsic(Instruction &I);
130 bool foldExtractedCmps(Instruction &I);
131 bool foldSelectsFromBitcast(Instruction &I);
132 bool foldBinopOfReductions(Instruction &I);
133 bool foldSingleElementStore(Instruction &I);
134 bool scalarizeLoad(Instruction &I);
135 bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
136 bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
137 bool scalarizeExtExtract(Instruction &I);
138 bool foldConcatOfBoolMasks(Instruction &I);
139 bool foldPermuteOfBinops(Instruction &I);
140 bool foldShuffleOfBinops(Instruction &I);
141 bool foldShuffleOfSelects(Instruction &I);
142 bool foldShuffleOfCastops(Instruction &I);
143 bool foldShuffleOfShuffles(Instruction &I);
144 bool foldPermuteOfIntrinsic(Instruction &I);
145 bool foldShufflesOfLengthChangingShuffles(Instruction &I);
146 bool foldShuffleOfIntrinsics(Instruction &I);
147 bool foldShuffleToIdentity(Instruction &I);
148 bool foldShuffleFromReductions(Instruction &I);
149 bool foldShuffleChainsToReduce(Instruction &I);
150 bool foldCastFromReductions(Instruction &I);
151 bool foldSignBitReductionCmp(Instruction &I);
152 bool foldICmpEqZeroVectorReduce(Instruction &I);
153 bool foldEquivalentReductionCmp(Instruction &I);
154 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
155 bool foldInterleaveIntrinsics(Instruction &I);
156 bool shrinkType(Instruction &I);
157 bool shrinkLoadForShuffles(Instruction &I);
158 bool shrinkPhiOfShuffles(Instruction &I);
159
160 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
161 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
162 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
163 Old.replaceAllUsesWith(&New);
164 if (auto *NewI = dyn_cast<Instruction>(&New)) {
165 New.takeName(&Old);
166 Worklist.pushUsersToWorkList(*NewI);
167 Worklist.pushValue(NewI);
168 }
169 if (Erase && isInstructionTriviallyDead(&Old)) {
170 eraseInstruction(Old);
171 } else {
172 Worklist.push(&Old);
173 }
174 }
175
176 void eraseInstruction(Instruction &I) {
177 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
178 SmallVector<Value *> Ops(I.operands());
179 Worklist.remove(&I);
180 I.eraseFromParent();
181
182 // Push remaining users of the operands and then the operand itself - allows
183 // further folds that were hindered by OneUse limits.
184 SmallPtrSet<Value *, 4> Visited;
185 for (Value *Op : Ops) {
186 if (!Visited.contains(Op)) {
187 if (auto *OpI = dyn_cast<Instruction>(Op)) {
189 OpI, nullptr, nullptr, [&](Value *V) {
190 if (auto *I = dyn_cast<Instruction>(V)) {
191 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
192 Worklist.remove(I);
193 if (I == NextInst)
194 NextInst = NextInst->getNextNode();
195 Visited.insert(I);
196 }
197 }))
198 continue;
199 Worklist.pushUsersToWorkList(*OpI);
200 Worklist.pushValue(OpI);
201 }
202 }
203 }
204 }
205};
206} // namespace
207
208/// Return the source operand of a potentially bitcasted value. If there is no
209/// bitcast, return the input value itself.
211 while (auto *BitCast = dyn_cast<BitCastInst>(V))
212 V = BitCast->getOperand(0);
213 return V;
214}
215
216static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
217 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
218 // The widened load may load data from dirty regions or create data races
219 // non-existent in the source.
220 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
221 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
223 return false;
224
225 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
226 // sure we have all of our type-based constraints in place for this target.
227 Type *ScalarTy = Load->getType()->getScalarType();
228 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
229 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
230 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
231 ScalarSize % 8 != 0)
232 return false;
233
234 return true;
235}
236
237bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
238 // Match insert into fixed vector of scalar value.
239 // TODO: Handle non-zero insert index.
240 Value *Scalar;
241 if (!match(&I,
243 return false;
244
245 // Optionally match an extract from another vector.
246 Value *X;
247 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
248 if (!HasExtract)
249 X = Scalar;
250
251 auto *Load = dyn_cast<LoadInst>(X);
252 if (!canWidenLoad(Load, TTI))
253 return false;
254
255 Type *ScalarTy = Scalar->getType();
256 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
257 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
258
259 // Check safety of replacing the scalar load with a larger vector load.
260 // We use minimal alignment (maximum flexibility) because we only care about
261 // the dereferenceable region. When calculating cost and creating a new op,
262 // we may use a larger value based on alignment attributes.
263 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
264 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
265
266 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
267 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
268 unsigned OffsetEltIndex = 0;
269 Align Alignment = Load->getAlign();
270 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
271 &DT)) {
272 // It is not safe to load directly from the pointer, but we can still peek
273 // through gep offsets and check if it safe to load from a base address with
274 // updated alignment. If it is, we can shuffle the element(s) into place
275 // after loading.
276 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
277 APInt Offset(OffsetBitWidth, 0);
279
280 // We want to shuffle the result down from a high element of a vector, so
281 // the offset must be positive.
282 if (Offset.isNegative())
283 return false;
284
285 // The offset must be a multiple of the scalar element to shuffle cleanly
286 // in the element's size.
287 uint64_t ScalarSizeInBytes = ScalarSize / 8;
288 if (Offset.urem(ScalarSizeInBytes) != 0)
289 return false;
290
291 // If we load MinVecNumElts, will our target element still be loaded?
292 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
293 if (OffsetEltIndex >= MinVecNumElts)
294 return false;
295
296 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
297 &DT))
298 return false;
299
300 // Update alignment with offset value. Note that the offset could be negated
301 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
302 // negation does not change the result of the alignment calculation.
303 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
304 }
305
306 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
307 // Use the greater of the alignment on the load or its source pointer.
308 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
309 Type *LoadTy = Load->getType();
310 unsigned AS = Load->getPointerAddressSpace();
311 InstructionCost OldCost =
312 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
313 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
314 OldCost +=
315 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
316 /* Insert */ true, HasExtract, CostKind);
317
318 // New pattern: load VecPtr
319 InstructionCost NewCost =
320 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
321 // Optionally, we are shuffling the loaded vector element(s) into place.
322 // For the mask set everything but element 0 to undef to prevent poison from
323 // propagating from the extra loaded memory. This will also optionally
324 // shrink/grow the vector from the loaded size to the output size.
325 // We assume this operation has no cost in codegen if there was no offset.
326 // Note that we could use freeze to avoid poison problems, but then we might
327 // still need a shuffle to change the vector size.
328 auto *Ty = cast<FixedVectorType>(I.getType());
329 unsigned OutputNumElts = Ty->getNumElements();
330 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
331 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
332 Mask[0] = OffsetEltIndex;
333 if (OffsetEltIndex)
334 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
335 CostKind);
336
337 // We can aggressively convert to the vector form because the backend can
338 // invert this transform if it does not result in a performance win.
339 if (OldCost < NewCost || !NewCost.isValid())
340 return false;
341
342 // It is safe and potentially profitable to load a vector directly:
343 // inselt undef, load Scalar, 0 --> load VecPtr
344 IRBuilder<> Builder(Load);
345 Value *CastedPtr =
346 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
347 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
348 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
349
350 replaceValue(I, *VecLd);
351 ++NumVecLoad;
352 return true;
353}
354
355/// If we are loading a vector and then inserting it into a larger vector with
356/// undefined elements, try to load the larger vector and eliminate the insert.
357/// This removes a shuffle in IR and may allow combining of other loaded values.
358bool VectorCombine::widenSubvectorLoad(Instruction &I) {
359 // Match subvector insert of fixed vector.
360 auto *Shuf = cast<ShuffleVectorInst>(&I);
361 if (!Shuf->isIdentityWithPadding())
362 return false;
363
364 // Allow a non-canonical shuffle mask that is choosing elements from op1.
365 unsigned NumOpElts =
366 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
367 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
368 return M >= (int)(NumOpElts);
369 });
370
371 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
372 if (!canWidenLoad(Load, TTI))
373 return false;
374
375 // We use minimal alignment (maximum flexibility) because we only care about
376 // the dereferenceable region. When calculating cost and creating a new op,
377 // we may use a larger value based on alignment attributes.
378 auto *Ty = cast<FixedVectorType>(I.getType());
379 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
380 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
381 Align Alignment = Load->getAlign();
382 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
383 return false;
384
385 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
386 Type *LoadTy = Load->getType();
387 unsigned AS = Load->getPointerAddressSpace();
388
389 // Original pattern: insert_subvector (load PtrOp)
390 // This conservatively assumes that the cost of a subvector insert into an
391 // undef value is 0. We could add that cost if the cost model accurately
392 // reflects the real cost of that operation.
393 InstructionCost OldCost =
394 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
395
396 // New pattern: load PtrOp
397 InstructionCost NewCost =
398 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
399
400 // We can aggressively convert to the vector form because the backend can
401 // invert this transform if it does not result in a performance win.
402 if (OldCost < NewCost || !NewCost.isValid())
403 return false;
404
405 IRBuilder<> Builder(Load);
406 Value *CastedPtr =
407 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
408 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
409 replaceValue(I, *VecLd);
410 ++NumVecLoad;
411 return true;
412}
413
414/// Determine which, if any, of the inputs should be replaced by a shuffle
415/// followed by extract from a different index.
416ExtractElementInst *VectorCombine::getShuffleExtract(
417 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
418 unsigned PreferredExtractIndex = InvalidIndex) const {
419 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
420 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
421 assert(Index0C && Index1C && "Expected constant extract indexes");
422
423 unsigned Index0 = Index0C->getZExtValue();
424 unsigned Index1 = Index1C->getZExtValue();
425
426 // If the extract indexes are identical, no shuffle is needed.
427 if (Index0 == Index1)
428 return nullptr;
429
430 Type *VecTy = Ext0->getVectorOperand()->getType();
431 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
432 InstructionCost Cost0 =
433 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
434 InstructionCost Cost1 =
435 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
436
437 // If both costs are invalid no shuffle is needed
438 if (!Cost0.isValid() && !Cost1.isValid())
439 return nullptr;
440
441 // We are extracting from 2 different indexes, so one operand must be shuffled
442 // before performing a vector operation and/or extract. The more expensive
443 // extract will be replaced by a shuffle.
444 if (Cost0 > Cost1)
445 return Ext0;
446 if (Cost1 > Cost0)
447 return Ext1;
448
449 // If the costs are equal and there is a preferred extract index, shuffle the
450 // opposite operand.
451 if (PreferredExtractIndex == Index0)
452 return Ext1;
453 if (PreferredExtractIndex == Index1)
454 return Ext0;
455
456 // Otherwise, replace the extract with the higher index.
457 return Index0 > Index1 ? Ext0 : Ext1;
458}
459
460/// Compare the relative costs of 2 extracts followed by scalar operation vs.
461/// vector operation(s) followed by extract. Return true if the existing
462/// instructions are cheaper than a vector alternative. Otherwise, return false
463/// and if one of the extracts should be transformed to a shufflevector, set
464/// \p ConvertToShuffle to that extract instruction.
465bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
466 ExtractElementInst *Ext1,
467 const Instruction &I,
468 ExtractElementInst *&ConvertToShuffle,
469 unsigned PreferredExtractIndex) {
470 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
471 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
472 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
473
474 unsigned Opcode = I.getOpcode();
475 Value *Ext0Src = Ext0->getVectorOperand();
476 Value *Ext1Src = Ext1->getVectorOperand();
477 Type *ScalarTy = Ext0->getType();
478 auto *VecTy = cast<VectorType>(Ext0Src->getType());
479 InstructionCost ScalarOpCost, VectorOpCost;
480
481 // Get cost estimates for scalar and vector versions of the operation.
482 bool IsBinOp = Instruction::isBinaryOp(Opcode);
483 if (IsBinOp) {
484 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
485 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
486 } else {
487 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
488 "Expected a compare");
489 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
490 ScalarOpCost = TTI.getCmpSelInstrCost(
491 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
492 VectorOpCost = TTI.getCmpSelInstrCost(
493 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
494 }
495
496 // Get cost estimates for the extract elements. These costs will factor into
497 // both sequences.
498 unsigned Ext0Index = Ext0IndexC->getZExtValue();
499 unsigned Ext1Index = Ext1IndexC->getZExtValue();
500
501 InstructionCost Extract0Cost =
502 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
503 InstructionCost Extract1Cost =
504 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
505
506 // A more expensive extract will always be replaced by a splat shuffle.
507 // For example, if Ext0 is more expensive:
508 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
509 // extelt (opcode (splat V0, Ext0), V1), Ext1
510 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
511 // check the cost of creating a broadcast shuffle and shuffling both
512 // operands to element 0.
513 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
514 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
515 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
516
517 // Extra uses of the extracts mean that we include those costs in the
518 // vector total because those instructions will not be eliminated.
519 InstructionCost OldCost, NewCost;
520 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
521 // Handle a special case. If the 2 extracts are identical, adjust the
522 // formulas to account for that. The extra use charge allows for either the
523 // CSE'd pattern or an unoptimized form with identical values:
524 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
525 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
526 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
527 OldCost = CheapExtractCost + ScalarOpCost;
528 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
529 } else {
530 // Handle the general case. Each extract is actually a different value:
531 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
532 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
533 NewCost = VectorOpCost + CheapExtractCost +
534 !Ext0->hasOneUse() * Extract0Cost +
535 !Ext1->hasOneUse() * Extract1Cost;
536 }
537
538 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
539 if (ConvertToShuffle) {
540 if (IsBinOp && DisableBinopExtractShuffle)
541 return true;
542
543 // If we are extracting from 2 different indexes, then one operand must be
544 // shuffled before performing the vector operation. The shuffle mask is
545 // poison except for 1 lane that is being translated to the remaining
546 // extraction lane. Therefore, it is a splat shuffle. Ex:
547 // ShufMask = { poison, poison, 0, poison }
548 // TODO: The cost model has an option for a "broadcast" shuffle
549 // (splat-from-element-0), but no option for a more general splat.
550 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
551 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
553 ShuffleMask[BestInsIndex] = BestExtIndex;
555 VecTy, VecTy, ShuffleMask, CostKind, 0,
556 nullptr, {ConvertToShuffle});
557 } else {
559 VecTy, VecTy, {}, CostKind, 0, nullptr,
560 {ConvertToShuffle});
561 }
562 }
563
564 // Aggressively form a vector op if the cost is equal because the transform
565 // may enable further optimization.
566 // Codegen can reverse this transform (scalarize) if it was not profitable.
567 return OldCost < NewCost;
568}
569
570/// Create a shuffle that translates (shifts) 1 element from the input vector
571/// to a new element location.
572static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
573 unsigned NewIndex, IRBuilderBase &Builder) {
574 // The shuffle mask is poison except for 1 lane that is being translated
575 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
576 // ShufMask = { 2, poison, poison, poison }
577 auto *VecTy = cast<FixedVectorType>(Vec->getType());
578 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
579 ShufMask[NewIndex] = OldIndex;
580 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
581}
582
583/// Given an extract element instruction with constant index operand, shuffle
584/// the source vector (shift the scalar element) to a NewIndex for extraction.
585/// Return null if the input can be constant folded, so that we are not creating
586/// unnecessary instructions.
587static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
588 IRBuilderBase &Builder) {
589 // Shufflevectors can only be created for fixed-width vectors.
590 Value *X = ExtElt->getVectorOperand();
591 if (!isa<FixedVectorType>(X->getType()))
592 return nullptr;
593
594 // If the extract can be constant-folded, this code is unsimplified. Defer
595 // to other passes to handle that.
596 Value *C = ExtElt->getIndexOperand();
597 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
598 if (isa<Constant>(X))
599 return nullptr;
600
601 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
602 NewIndex, Builder);
603 return Shuf;
604}
605
606/// Try to reduce extract element costs by converting scalar compares to vector
607/// compares followed by extract.
608/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
609Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
610 Instruction &I) {
611 assert(isa<CmpInst>(&I) && "Expected a compare");
612
613 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
614 // --> extelt (cmp Pred V0, V1), ExtIndex
615 ++NumVecCmp;
616 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
617 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
618 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
619}
620
621/// Try to reduce extract element costs by converting scalar binops to vector
622/// binops followed by extract.
623/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
624Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
625 Instruction &I) {
626 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
627
628 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
629 // --> extelt (bo V0, V1), ExtIndex
630 ++NumVecBO;
631 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
632 V1, "foldExtExtBinop");
633
634 // All IR flags are safe to back-propagate because any potential poison
635 // created in unused vector elements is discarded by the extract.
636 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
637 VecBOInst->copyIRFlags(&I);
638
639 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
640}
641
642/// Match an instruction with extracted vector operands.
643bool VectorCombine::foldExtractExtract(Instruction &I) {
644 // It is not safe to transform things like div, urem, etc. because we may
645 // create undefined behavior when executing those on unknown vector elements.
647 return false;
648
649 Instruction *I0, *I1;
650 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
651 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
653 return false;
654
655 Value *V0, *V1;
656 uint64_t C0, C1;
657 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
658 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
659 V0->getType() != V1->getType())
660 return false;
661
662 // If the scalar value 'I' is going to be re-inserted into a vector, then try
663 // to create an extract to that same element. The extract/insert can be
664 // reduced to a "select shuffle".
665 // TODO: If we add a larger pattern match that starts from an insert, this
666 // probably becomes unnecessary.
667 auto *Ext0 = cast<ExtractElementInst>(I0);
668 auto *Ext1 = cast<ExtractElementInst>(I1);
669 uint64_t InsertIndex = InvalidIndex;
670 if (I.hasOneUse())
671 match(I.user_back(),
672 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
673
674 ExtractElementInst *ExtractToChange;
675 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
676 return false;
677
678 Value *ExtOp0 = Ext0->getVectorOperand();
679 Value *ExtOp1 = Ext1->getVectorOperand();
680
681 if (ExtractToChange) {
682 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
683 Value *NewExtOp =
684 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
685 if (!NewExtOp)
686 return false;
687 if (ExtractToChange == Ext0)
688 ExtOp0 = NewExtOp;
689 else
690 ExtOp1 = NewExtOp;
691 }
692
693 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
694 : Ext0->getIndexOperand();
695 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
696 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
697 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
698 Worklist.push(Ext0);
699 Worklist.push(Ext1);
700 replaceValue(I, *NewExt);
701 return true;
702}
703
704/// Try to replace an extract + scalar fneg + insert with a vector fneg +
705/// shuffle.
706bool VectorCombine::foldInsExtFNeg(Instruction &I) {
707 // Match an insert (op (extract)) pattern.
708 Value *DstVec;
709 uint64_t ExtIdx, InsIdx;
710 Instruction *FNeg;
711 if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
712 m_ConstantInt(InsIdx))))
713 return false;
714
715 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
716 Value *SrcVec;
717 Instruction *Extract;
718 if (!match(FNeg, m_FNeg(m_CombineAnd(
719 m_Instruction(Extract),
720 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
721 return false;
722
723 auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
724 auto *DstVecScalarTy = DstVecTy->getScalarType();
725 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
726 if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
727 return false;
728
729 // Ignore if insert/extract index is out of bounds or destination vector has
730 // one element
731 unsigned NumDstElts = DstVecTy->getNumElements();
732 unsigned NumSrcElts = SrcVecTy->getNumElements();
733 if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
734 return false;
735
736 // We are inserting the negated element into the same lane that we extracted
737 // from. This is equivalent to a select-shuffle that chooses all but the
738 // negated element from the destination vector.
739 SmallVector<int> Mask(NumDstElts);
740 std::iota(Mask.begin(), Mask.end(), 0);
741 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
742 InstructionCost OldCost =
743 TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
744 TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
745
746 // If the extract has one use, it will be eliminated, so count it in the
747 // original cost. If it has more than one use, ignore the cost because it will
748 // be the same before/after.
749 if (Extract->hasOneUse())
750 OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
751
752 InstructionCost NewCost =
753 TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
755 DstVecTy, Mask, CostKind);
756
757 bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
758 // If the lengths of the two vectors are not equal,
759 // we need to add a length-change vector. Add this cost.
760 SmallVector<int> SrcMask;
761 if (NeedLenChg) {
762 SrcMask.assign(NumDstElts, PoisonMaskElem);
763 SrcMask[ExtIdx % NumDstElts] = ExtIdx;
765 DstVecTy, SrcVecTy, SrcMask, CostKind);
766 }
767
768 LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
769 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
770 << "\n");
771 if (NewCost > OldCost)
772 return false;
773
774 Value *NewShuf, *LenChgShuf = nullptr;
775 // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
776 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
777 if (NeedLenChg) {
778 // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
779 LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
780 NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
781 Worklist.pushValue(LenChgShuf);
782 } else {
783 // shuffle DstVec, (fneg SrcVec), Mask
784 NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
785 }
786
787 Worklist.pushValue(VecFNeg);
788 replaceValue(I, *NewShuf);
789 return true;
790}
791
792/// Try to fold insert(binop(x,y),binop(a,b),idx)
793/// --> binop(insert(x,a,idx),insert(y,b,idx))
794bool VectorCombine::foldInsExtBinop(Instruction &I) {
795 BinaryOperator *VecBinOp, *SclBinOp;
796 uint64_t Index;
797 if (!match(&I,
798 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
799 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
800 return false;
801
802 // TODO: Add support for addlike etc.
803 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
804 if (BinOpcode != SclBinOp->getOpcode())
805 return false;
806
807 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
808 if (!ResultTy)
809 return false;
810
811 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
812 // shuffle?
813
815 TTI.getInstructionCost(VecBinOp, CostKind) +
817 InstructionCost NewCost =
818 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
819 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
820 Index, VecBinOp->getOperand(0),
821 SclBinOp->getOperand(0)) +
822 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
823 Index, VecBinOp->getOperand(1),
824 SclBinOp->getOperand(1));
825
826 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
827 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
828 << "\n");
829 if (NewCost > OldCost)
830 return false;
831
832 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
833 SclBinOp->getOperand(0), Index);
834 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
835 SclBinOp->getOperand(1), Index);
836 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
837
838 // Intersect flags from the old binops.
839 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
840 NewInst->copyIRFlags(VecBinOp);
841 NewInst->andIRFlags(SclBinOp);
842 }
843
844 Worklist.pushValue(NewIns0);
845 Worklist.pushValue(NewIns1);
846 replaceValue(I, *NewBO);
847 return true;
848}
849
850/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
851/// Supports: bitcast, trunc, sext, zext
852bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
853 // Check if this is a bitwise logic operation
854 auto *BinOp = dyn_cast<BinaryOperator>(&I);
855 if (!BinOp || !BinOp->isBitwiseLogicOp())
856 return false;
857
858 // Get the cast instructions
859 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
860 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
861 if (!LHSCast || !RHSCast) {
862 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
863 return false;
864 }
865
866 // Both casts must be the same type
867 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
868 if (CastOpcode != RHSCast->getOpcode())
869 return false;
870
871 // Only handle supported cast operations
872 switch (CastOpcode) {
873 case Instruction::BitCast:
874 case Instruction::Trunc:
875 case Instruction::SExt:
876 case Instruction::ZExt:
877 break;
878 default:
879 return false;
880 }
881
882 Value *LHSSrc = LHSCast->getOperand(0);
883 Value *RHSSrc = RHSCast->getOperand(0);
884
885 // Source types must match
886 if (LHSSrc->getType() != RHSSrc->getType())
887 return false;
888
889 auto *SrcTy = LHSSrc->getType();
890 auto *DstTy = I.getType();
891 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
892 // Other casts only handle vector types with integer elements.
893 if (CastOpcode != Instruction::BitCast &&
894 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
895 return false;
896
897 // Only integer scalar/vector values are legal for bitwise logic operations.
898 if (!SrcTy->getScalarType()->isIntegerTy() ||
899 !DstTy->getScalarType()->isIntegerTy())
900 return false;
901
902 // Cost Check :
903 // OldCost = bitlogic + 2*casts
904 // NewCost = bitlogic + cast
905
906 // Calculate specific costs for each cast with instruction context
908 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
910 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
911
912 InstructionCost OldCost =
913 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
914 LHSCastCost + RHSCastCost;
915
916 // For new cost, we can't provide an instruction (it doesn't exist yet)
917 InstructionCost GenericCastCost = TTI.getCastInstrCost(
918 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
919
920 InstructionCost NewCost =
921 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
922 GenericCastCost;
923
924 // Account for multi-use casts using specific costs
925 if (!LHSCast->hasOneUse())
926 NewCost += LHSCastCost;
927 if (!RHSCast->hasOneUse())
928 NewCost += RHSCastCost;
929
930 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
931 << " NewCost=" << NewCost << "\n");
932
933 if (NewCost > OldCost)
934 return false;
935
936 // Create the operation on the source type
937 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
938 BinOp->getName() + ".inner");
939 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
940 NewBinOp->copyIRFlags(BinOp);
941
942 Worklist.pushValue(NewOp);
943
944 // Create the cast operation directly to ensure we get a new instruction
945 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
946
947 // Preserve cast instruction flags
948 NewCast->copyIRFlags(LHSCast);
949 NewCast->andIRFlags(RHSCast);
950
951 // Insert the new instruction
952 Value *Result = Builder.Insert(NewCast);
953
954 replaceValue(I, *Result);
955 return true;
956}
957
958/// Match:
959// bitop(castop(x), C) ->
960// bitop(castop(x), castop(InvC)) ->
961// castop(bitop(x, InvC))
962// Supports: bitcast
963bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
965 Constant *C;
966
967 // Check if this is a bitwise logic operation
969 return false;
970
971 // Get the cast instructions
972 auto *LHSCast = dyn_cast<CastInst>(LHS);
973 if (!LHSCast)
974 return false;
975
976 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
977
978 // Only handle supported cast operations
979 switch (CastOpcode) {
980 case Instruction::BitCast:
981 case Instruction::ZExt:
982 case Instruction::SExt:
983 case Instruction::Trunc:
984 break;
985 default:
986 return false;
987 }
988
989 Value *LHSSrc = LHSCast->getOperand(0);
990
991 auto *SrcTy = LHSSrc->getType();
992 auto *DstTy = I.getType();
993 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
994 // Other casts only handle vector types with integer elements.
995 if (CastOpcode != Instruction::BitCast &&
996 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
997 return false;
998
999 // Only integer scalar/vector values are legal for bitwise logic operations.
1000 if (!SrcTy->getScalarType()->isIntegerTy() ||
1001 !DstTy->getScalarType()->isIntegerTy())
1002 return false;
1003
1004 // Find the constant InvC, such that castop(InvC) equals to C.
1005 PreservedCastFlags RHSFlags;
1006 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
1007 if (!InvC)
1008 return false;
1009
1010 // Cost Check :
1011 // OldCost = bitlogic + cast
1012 // NewCost = bitlogic + cast
1013
1014 // Calculate specific costs for each cast with instruction context
1015 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1016 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1017
1018 InstructionCost OldCost =
1019 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1020
1021 // For new cost, we can't provide an instruction (it doesn't exist yet)
1022 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1023 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1024
1025 InstructionCost NewCost =
1026 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1027 GenericCastCost;
1028
1029 // Account for multi-use casts using specific costs
1030 if (!LHSCast->hasOneUse())
1031 NewCost += LHSCastCost;
1032
1033 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1034 << " NewCost=" << NewCost << "\n");
1035
1036 if (NewCost > OldCost)
1037 return false;
1038
1039 // Create the operation on the source type
1040 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1041 LHSSrc, InvC, I.getName() + ".inner");
1042 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1043 NewBinOp->copyIRFlags(&I);
1044
1045 Worklist.pushValue(NewOp);
1046
1047 // Create the cast operation directly to ensure we get a new instruction
1048 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1049
1050 // Preserve cast instruction flags
1051 if (RHSFlags.NNeg)
1052 NewCast->setNonNeg();
1053 if (RHSFlags.NUW)
1054 NewCast->setHasNoUnsignedWrap();
1055 if (RHSFlags.NSW)
1056 NewCast->setHasNoSignedWrap();
1057
1058 NewCast->andIRFlags(LHSCast);
1059
1060 // Insert the new instruction
1061 Value *Result = Builder.Insert(NewCast);
1062
1063 replaceValue(I, *Result);
1064 return true;
1065}
1066
1067/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1068/// destination type followed by shuffle. This can enable further transforms by
1069/// moving bitcasts or shuffles together.
1070bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1071 Value *V0, *V1;
1072 ArrayRef<int> Mask;
1073 if (!match(&I, m_BitCast(m_OneUse(
1074 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1075 return false;
1076
1077 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1078 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1079 // mask for scalable type is a splat or not.
1080 // 2) Disallow non-vector casts.
1081 // TODO: We could allow any shuffle.
1082 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1083 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1084 if (!DestTy || !SrcTy)
1085 return false;
1086
1087 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1088 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1089 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1090 return false;
1091
1092 bool IsUnary = isa<UndefValue>(V1);
1093
1094 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1095 // if it won't increase the number of bitcasts.
1096 if (!IsUnary) {
1099 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1100 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1101 return false;
1102 }
1103
1104 SmallVector<int, 16> NewMask;
1105 if (DestEltSize <= SrcEltSize) {
1106 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1107 // always be expanded to the equivalent form choosing narrower elements.
1108 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
1109 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1110 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1111 } else {
1112 // The bitcast is from narrow elements to wide elements. The shuffle mask
1113 // must choose consecutive elements to allow casting first.
1114 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
1115 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1116 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1117 return false;
1118 }
1119
1120 // Bitcast the shuffle src - keep its original width but using the destination
1121 // scalar type.
1122 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1123 auto *NewShuffleTy =
1124 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1125 auto *OldShuffleTy =
1126 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1127 unsigned NumOps = IsUnary ? 1 : 2;
1128
1129 // The new shuffle must not cost more than the old shuffle.
1133
1134 InstructionCost NewCost =
1135 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1136 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1137 TargetTransformInfo::CastContextHint::None,
1138 CostKind));
1139 InstructionCost OldCost =
1140 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1141 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1142 TargetTransformInfo::CastContextHint::None,
1143 CostKind);
1144
1145 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1146 << OldCost << " vs NewCost: " << NewCost << "\n");
1147
1148 if (NewCost > OldCost || !NewCost.isValid())
1149 return false;
1150
1151 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1152 ++NumShufOfBitcast;
1153 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1154 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1155 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1156 replaceValue(I, *Shuf);
1157 return true;
1158}
1159
1160/// VP Intrinsics whose vector operands are both splat values may be simplified
1161/// into the scalar version of the operation and the result splatted. This
1162/// can lead to scalarization down the line.
1163bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1164 if (!isa<VPIntrinsic>(I))
1165 return false;
1166 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1167 Value *Op0 = VPI.getArgOperand(0);
1168 Value *Op1 = VPI.getArgOperand(1);
1169
1170 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1171 return false;
1172
1173 // Check getSplatValue early in this function, to avoid doing unnecessary
1174 // work.
1175 Value *ScalarOp0 = getSplatValue(Op0);
1176 Value *ScalarOp1 = getSplatValue(Op1);
1177 if (!ScalarOp0 || !ScalarOp1)
1178 return false;
1179
1180 // For the binary VP intrinsics supported here, the result on disabled lanes
1181 // is a poison value. For now, only do this simplification if all lanes
1182 // are active.
1183 // TODO: Relax the condition that all lanes are active by using insertelement
1184 // on inactive lanes.
1185 auto IsAllTrueMask = [](Value *MaskVal) {
1186 if (Value *SplattedVal = getSplatValue(MaskVal))
1187 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1188 return ConstValue->isAllOnesValue();
1189 return false;
1190 };
1191 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1192 return false;
1193
1194 // Check to make sure we support scalarization of the intrinsic
1195 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1196 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1197 return false;
1198
1199 // Calculate cost of splatting both operands into vectors and the vector
1200 // intrinsic
1201 VectorType *VecTy = cast<VectorType>(VPI.getType());
1202 SmallVector<int> Mask;
1203 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1204 Mask.resize(FVTy->getNumElements(), 0);
1205 InstructionCost SplatCost =
1206 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
1208 CostKind);
1209
1210 // Calculate the cost of the VP Intrinsic
1212 for (Value *V : VPI.args())
1213 Args.push_back(V->getType());
1214 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1215 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1216 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1217
1218 // Determine scalar opcode
1219 std::optional<unsigned> FunctionalOpcode =
1220 VPI.getFunctionalOpcode();
1221 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1222 if (!FunctionalOpcode) {
1223 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1224 if (!ScalarIntrID)
1225 return false;
1226 }
1227
1228 // Calculate cost of scalarizing
1229 InstructionCost ScalarOpCost = 0;
1230 if (ScalarIntrID) {
1231 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1232 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1233 } else {
1234 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1235 VecTy->getScalarType(), CostKind);
1236 }
1237
1238 // The existing splats may be kept around if other instructions use them.
1239 InstructionCost CostToKeepSplats =
1240 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1241 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1242
1243 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1244 << "\n");
1245 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1246 << ", Cost of scalarizing:" << NewCost << "\n");
1247
1248 // We want to scalarize unless the vector variant actually has lower cost.
1249 if (OldCost < NewCost || !NewCost.isValid())
1250 return false;
1251
1252 // Scalarize the intrinsic
1253 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1254 Value *EVL = VPI.getArgOperand(3);
1255
1256 // If the VP op might introduce UB or poison, we can scalarize it provided
1257 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1258 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1259 // scalarizing it.
1260 bool SafeToSpeculate;
1261 if (ScalarIntrID)
1262 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1263 .hasAttribute(Attribute::AttrKind::Speculatable);
1264 else
1266 *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
1267 if (!SafeToSpeculate &&
1268 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
1269 return false;
1270
1271 Value *ScalarVal =
1272 ScalarIntrID
1273 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1274 {ScalarOp0, ScalarOp1})
1275 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1276 ScalarOp0, ScalarOp1);
1277
1278 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1279 return true;
1280}
1281
1282/// Match a vector op/compare/intrinsic with at least one
1283/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1284/// by insertelement.
1285bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1286 auto *UO = dyn_cast<UnaryOperator>(&I);
1287 auto *BO = dyn_cast<BinaryOperator>(&I);
1288 auto *CI = dyn_cast<CmpInst>(&I);
1289 auto *II = dyn_cast<IntrinsicInst>(&I);
1290 if (!UO && !BO && !CI && !II)
1291 return false;
1292
1293 // TODO: Allow intrinsics with different argument types
1294 if (II) {
1295 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1296 return false;
1297 for (auto [Idx, Arg] : enumerate(II->args()))
1298 if (Arg->getType() != II->getType() &&
1299 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1300 return false;
1301 }
1302
1303 // Do not convert the vector condition of a vector select into a scalar
1304 // condition. That may cause problems for codegen because of differences in
1305 // boolean formats and register-file transfers.
1306 // TODO: Can we account for that in the cost model?
1307 if (CI)
1308 for (User *U : I.users())
1309 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1310 return false;
1311
1312 // Match constant vectors or scalars being inserted into constant vectors:
1313 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1314 SmallVector<Value *> VecCs, ScalarOps;
1315 std::optional<uint64_t> Index;
1316
1317 auto Ops = II ? II->args() : I.operands();
1318 for (auto [OpNum, Op] : enumerate(Ops)) {
1319 Constant *VecC;
1320 Value *V;
1321 uint64_t InsIdx = 0;
1322 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1323 m_ConstantInt(InsIdx)))) {
1324 // Bail if any inserts are out of bounds.
1325 VectorType *OpTy = cast<VectorType>(Op->getType());
1326 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1327 return false;
1328 // All inserts must have the same index.
1329 // TODO: Deal with mismatched index constants and variable indexes?
1330 if (!Index)
1331 Index = InsIdx;
1332 else if (InsIdx != *Index)
1333 return false;
1334 VecCs.push_back(VecC);
1335 ScalarOps.push_back(V);
1336 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1337 OpNum, &TTI)) {
1338 VecCs.push_back(Op.get());
1339 ScalarOps.push_back(Op.get());
1340 } else if (match(Op.get(), m_Constant(VecC))) {
1341 VecCs.push_back(VecC);
1342 ScalarOps.push_back(nullptr);
1343 } else {
1344 return false;
1345 }
1346 }
1347
1348 // Bail if all operands are constant.
1349 if (!Index.has_value())
1350 return false;
1351
1352 VectorType *VecTy = cast<VectorType>(I.getType());
1353 Type *ScalarTy = VecTy->getScalarType();
1354 assert(VecTy->isVectorTy() &&
1355 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1356 ScalarTy->isPointerTy()) &&
1357 "Unexpected types for insert element into binop or cmp");
1358
1359 unsigned Opcode = I.getOpcode();
1360 InstructionCost ScalarOpCost, VectorOpCost;
1361 if (CI) {
1362 CmpInst::Predicate Pred = CI->getPredicate();
1363 ScalarOpCost = TTI.getCmpSelInstrCost(
1364 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1365 VectorOpCost = TTI.getCmpSelInstrCost(
1366 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1367 } else if (UO || BO) {
1368 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1369 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1370 } else {
1371 IntrinsicCostAttributes ScalarICA(
1372 II->getIntrinsicID(), ScalarTy,
1373 SmallVector<Type *>(II->arg_size(), ScalarTy));
1374 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1375 IntrinsicCostAttributes VectorICA(
1376 II->getIntrinsicID(), VecTy,
1377 SmallVector<Type *>(II->arg_size(), VecTy));
1378 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1379 }
1380
1381 // Fold the vector constants in the original vectors into a new base vector to
1382 // get more accurate cost modelling.
1383 Value *NewVecC = nullptr;
1384 if (CI)
1385 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1386 else if (UO)
1387 NewVecC =
1388 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1389 else if (BO)
1390 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1391 else if (II)
1392 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1393
1394 if (!NewVecC)
1395 return false;
1396
1397 // Get cost estimate for the insert element. This cost will factor into
1398 // both sequences.
1399 InstructionCost OldCost = VectorOpCost;
1400 InstructionCost NewCost =
1401 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1402 CostKind, *Index, NewVecC);
1403
1404 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1405 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1406 II->getIntrinsicID(), Idx, &TTI)))
1407 continue;
1409 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1410 OldCost += InsertCost;
1411 NewCost += !Op->hasOneUse() * InsertCost;
1412 }
1413
1414 // We want to scalarize unless the vector variant actually has lower cost.
1415 if (OldCost < NewCost || !NewCost.isValid())
1416 return false;
1417
1418 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1419 // inselt NewVecC, (scalar_op V0, V1), Index
1420 if (CI)
1421 ++NumScalarCmp;
1422 else if (UO || BO)
1423 ++NumScalarOps;
1424 else
1425 ++NumScalarIntrinsic;
1426
1427 // For constant cases, extract the scalar element, this should constant fold.
1428 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1429 if (!Scalar)
1431 cast<Constant>(VecC), Builder.getInt64(*Index));
1432
1433 Value *Scalar;
1434 if (CI)
1435 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1436 else if (UO || BO)
1437 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1438 else
1439 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1440
1441 Scalar->setName(I.getName() + ".scalar");
1442
1443 // All IR flags are safe to back-propagate. There is no potential for extra
1444 // poison to be created by the scalar instruction.
1445 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1446 ScalarInst->copyIRFlags(&I);
1447
1448 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1449 replaceValue(I, *Insert);
1450 return true;
1451}
1452
1453/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1454/// a vector into vector operations followed by extract. Note: The SLP pass
1455/// may miss this pattern because of implementation problems.
1456bool VectorCombine::foldExtractedCmps(Instruction &I) {
1457 auto *BI = dyn_cast<BinaryOperator>(&I);
1458
1459 // We are looking for a scalar binop of booleans.
1460 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1461 if (!BI || !I.getType()->isIntegerTy(1))
1462 return false;
1463
1464 // The compare predicates should match, and each compare should have a
1465 // constant operand.
1466 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1467 Instruction *I0, *I1;
1468 Constant *C0, *C1;
1469 CmpPredicate P0, P1;
1470 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1471 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1472 return false;
1473
1474 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1475 if (!MatchingPred)
1476 return false;
1477
1478 // The compare operands must be extracts of the same vector with constant
1479 // extract indexes.
1480 Value *X;
1481 uint64_t Index0, Index1;
1482 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1483 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1484 return false;
1485
1486 auto *Ext0 = cast<ExtractElementInst>(I0);
1487 auto *Ext1 = cast<ExtractElementInst>(I1);
1488 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1489 if (!ConvertToShuf)
1490 return false;
1491 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1492 "Unknown ExtractElementInst");
1493
1494 // The original scalar pattern is:
1495 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1496 CmpInst::Predicate Pred = *MatchingPred;
1497 unsigned CmpOpcode =
1498 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1499 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1500 if (!VecTy)
1501 return false;
1502
1503 InstructionCost Ext0Cost =
1504 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1505 InstructionCost Ext1Cost =
1506 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
1507 InstructionCost CmpCost = TTI.getCmpSelInstrCost(
1508 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1509 CostKind);
1510
1511 InstructionCost OldCost =
1512 Ext0Cost + Ext1Cost + CmpCost * 2 +
1513 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1514
1515 // The proposed vector pattern is:
1516 // vcmp = cmp Pred X, VecC
1517 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1518 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1519 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1520 auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecTy));
1521 InstructionCost NewCost = TTI.getCmpSelInstrCost(
1522 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1523 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1524 ShufMask[CheapIndex] = ExpensiveIndex;
1525 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
1526 CmpTy, ShufMask, CostKind);
1527 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1528 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1529 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1530 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1531
1532 // Aggressively form vector ops if the cost is equal because the transform
1533 // may enable further optimization.
1534 // Codegen can reverse this transform (scalarize) if it was not profitable.
1535 if (OldCost < NewCost || !NewCost.isValid())
1536 return false;
1537
1538 // Create a vector constant from the 2 scalar constants.
1539 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1540 PoisonValue::get(VecTy->getElementType()));
1541 CmpC[Index0] = C0;
1542 CmpC[Index1] = C1;
1543 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1544 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1545 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1546 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1547 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1548 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1549 replaceValue(I, *NewExt);
1550 ++NumVecCmpBO;
1551 return true;
1552}
1553
1554/// Try to fold scalar selects that select between extracted elements and zero
1555/// into extracting from a vector select. This is rooted at the bitcast.
1556///
1557/// This pattern arises when a vector is bitcast to a smaller element type,
1558/// elements are extracted, and then conditionally selected with zero:
1559///
1560/// %bc = bitcast <4 x i32> %src to <16 x i8>
1561/// %e0 = extractelement <16 x i8> %bc, i32 0
1562/// %s0 = select i1 %cond, i8 %e0, i8 0
1563/// %e1 = extractelement <16 x i8> %bc, i32 1
1564/// %s1 = select i1 %cond, i8 %e1, i8 0
1565/// ...
1566///
1567/// Transforms to:
1568/// %sel = select i1 %cond, <4 x i32> %src, <4 x i32> zeroinitializer
1569/// %bc = bitcast <4 x i32> %sel to <16 x i8>
1570/// %e0 = extractelement <16 x i8> %bc, i32 0
1571/// %e1 = extractelement <16 x i8> %bc, i32 1
1572/// ...
1573///
1574/// This is profitable because vector select on wider types produces fewer
1575/// select/cndmask instructions than scalar selects on each element.
1576bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
1577 auto *BC = dyn_cast<BitCastInst>(&I);
1578 if (!BC)
1579 return false;
1580
1581 FixedVectorType *SrcVecTy = dyn_cast<FixedVectorType>(BC->getSrcTy());
1582 FixedVectorType *DstVecTy = dyn_cast<FixedVectorType>(BC->getDestTy());
1583 if (!SrcVecTy || !DstVecTy)
1584 return false;
1585
1586 // Source must be 32-bit or 64-bit elements, destination must be smaller
1587 // integer elements. Zero in all these types is all-bits-zero.
1588 Type *SrcEltTy = SrcVecTy->getElementType();
1589 Type *DstEltTy = DstVecTy->getElementType();
1590 unsigned SrcEltBits = SrcEltTy->getPrimitiveSizeInBits();
1591 unsigned DstEltBits = DstEltTy->getPrimitiveSizeInBits();
1592
1593 if (SrcEltBits != 32 && SrcEltBits != 64)
1594 return false;
1595
1596 if (!DstEltTy->isIntegerTy() || DstEltBits >= SrcEltBits)
1597 return false;
1598
1599 // Check profitability using TTI before collecting users.
1600 Type *CondTy = CmpInst::makeCmpResultType(DstEltTy);
1601 Type *VecCondTy = CmpInst::makeCmpResultType(SrcVecTy);
1602
1603 InstructionCost ScalarSelCost =
1604 TTI.getCmpSelInstrCost(Instruction::Select, DstEltTy, CondTy,
1605 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1606 InstructionCost VecSelCost =
1607 TTI.getCmpSelInstrCost(Instruction::Select, SrcVecTy, VecCondTy,
1608 CmpInst::BAD_ICMP_PREDICATE, CostKind);
1609
1610 // We need at least this many selects for vectorization to be profitable.
1611 // VecSelCost < ScalarSelCost * NumSelects => NumSelects > VecSelCost /
1612 // ScalarSelCost
1613 if (!ScalarSelCost.isValid() || ScalarSelCost == 0)
1614 return false;
1615
1616 unsigned MinSelects = (VecSelCost.getValue() / ScalarSelCost.getValue()) + 1;
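// E.g. with VecSelCost = 4 and ScalarSelCost = 1 (illustrative numbers), at
// least 5 scalar selects must share a condition before the vector select wins.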
1617
1618 // Quick check: if bitcast doesn't have enough users, bail early.
1619 if (!BC->hasNUsesOrMore(MinSelects))
1620 return false;
1621
1622 // Collect all select users that match the pattern, grouped by condition.
1623 // Pattern: select i1 %cond, (extractelement %bc, idx), 0
1624 DenseMap<Value *, SmallVector<SelectInst *, 8>> CondToSelects;
1625
1626 for (User *U : BC->users()) {
1627 auto *Ext = dyn_cast<ExtractElementInst>(U);
1628 if (!Ext)
1629 continue;
1630
1631 for (User *ExtUser : Ext->users()) {
1632 Value *Cond;
1633 // Match: select i1 %cond, %ext, 0
1634 if (match(ExtUser, m_Select(m_Value(Cond), m_Specific(Ext), m_Zero())) &&
1635 Cond->getType()->isIntegerTy(1))
1636 CondToSelects[Cond].push_back(cast<SelectInst>(ExtUser));
1637 }
1638 }
1639
1640 if (CondToSelects.empty())
1641 return false;
1642
1643 bool MadeChange = false;
1644 Value *SrcVec = BC->getOperand(0);
1645
1646 // Process each group of selects with the same condition.
1647 for (auto [Cond, Selects] : CondToSelects) {
1648 // Only profitable if vector select cost < total scalar select cost.
1649 if (Selects.size() < MinSelects) {
1650 LLVM_DEBUG(dbgs() << "VectorCombine: foldSelectsFromBitcast not "
1651 << "profitable (VecCost=" << VecSelCost
1652 << ", ScalarCost=" << ScalarSelCost
1653 << ", NumSelects=" << Selects.size() << ")\n");
1654 continue;
1655 }
1656
1657 // Create the vector select and bitcast once for this condition.
1658 auto InsertPt = std::next(BC->getIterator());
1659
1660 if (auto *CondInst = dyn_cast<Instruction>(Cond))
1661 if (DT.dominates(BC, CondInst))
1662 InsertPt = std::next(CondInst->getIterator());
1663
1664 Builder.SetInsertPoint(InsertPt);
1665 Value *VecSel =
1666 Builder.CreateSelect(Cond, SrcVec, Constant::getNullValue(SrcVecTy));
1667 Value *NewBC = Builder.CreateBitCast(VecSel, DstVecTy);
1668
1669 // Replace each scalar select with an extract from the new bitcast.
1670 for (SelectInst *Sel : Selects) {
1671 auto *Ext = cast<ExtractElementInst>(Sel->getTrueValue());
1672 Value *Idx = Ext->getIndexOperand();
1673
1674 Builder.SetInsertPoint(Sel);
1675 Value *NewExt = Builder.CreateExtractElement(NewBC, Idx);
1676 replaceValue(*Sel, *NewExt);
1677 MadeChange = true;
1678 }
1679
1680 LLVM_DEBUG(dbgs() << "VectorCombine: folded " << Selects.size()
1681 << " selects into vector select\n");
1682 }
1683
1684 return MadeChange;
1685}
1686
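/// Estimate the cost of a vector reduction intrinsic \p II split into the cost
/// of the code feeding the reduction (\p CostBeforeReduction) and the cost of
/// the reduction itself (\p CostAfterReduction), recognizing zext/sext and
/// ext(mul(ext, ext)) operands that targets can fold into extended or
/// multiply-accumulate reductions.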
1687 static void analyzeCostOfVecReduction(const IntrinsicInst &II,
1688 TTI::TargetCostKind CostKind,
1689 const TargetTransformInfo &TTI,
1690 InstructionCost &CostBeforeReduction,
1691 InstructionCost &CostAfterReduction) {
1692 Instruction *Op0, *Op1;
1693 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1694 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1695 unsigned ReductionOpc =
1696 getArithmeticReductionInstruction(II.getIntrinsicID());
1697 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1698 bool IsUnsigned = isa<ZExtInst>(RedOp);
1699 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1700
1701 CostBeforeReduction =
1702 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1703 TTI::CastContextHint::None, CostKind);
1704 CostAfterReduction =
1705 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1706 ExtType, FastMathFlags(), CostKind);
1707 return;
1708 }
1709 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1710 match(RedOp,
1711 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
1712 match(Op0, m_ZExtOrSExt(m_Value())) &&
1713 Op0->getOpcode() == Op1->getOpcode() &&
1714 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1715 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1716 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1717 bool IsUnsigned = isa<ZExtInst>(Op0);
1718 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1719 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1720
1721 InstructionCost ExtCost =
1722 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1723 TTI::CastContextHint::None, CostKind);
1724 InstructionCost MulCost =
1725 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1726 InstructionCost Ext2Cost =
1727 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1728 TTI::CastContextHint::None, CostKind);
1729
1730 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1731 CostAfterReduction = TTI.getMulAccReductionCost(
1732 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1733 return;
1734 }
1735 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1736 std::nullopt, CostKind);
1737}
1738
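/// Try to fold a binary op of two reductions over the same vector type into a
/// single reduction of a vector binary op, e.g. (a hand-written sketch, not
/// taken from the regression tests):
///   %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
///   %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
///   %r = add i32 %r0, %r1
/// becomes, when the cost model agrees:
///   %v = add <4 x i32> %a, %b
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
/// A sub of two add-reductions is handled the same way by reducing the vector
/// difference.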
1739bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1740 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1741 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1742 if (BinOpOpc == Instruction::Sub)
1743 ReductionIID = Intrinsic::vector_reduce_add;
1744 if (ReductionIID == Intrinsic::not_intrinsic)
1745 return false;
1746
1747 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1748 Intrinsic::ID IID) -> Value * {
1749 auto *II = dyn_cast<IntrinsicInst>(V);
1750 if (!II)
1751 return nullptr;
1752 if (II->getIntrinsicID() == IID && II->hasOneUse())
1753 return II->getArgOperand(0);
1754 return nullptr;
1755 };
1756
1757 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1758 if (!V0)
1759 return false;
1760 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1761 if (!V1)
1762 return false;
1763
1764 auto *VTy = cast<VectorType>(V0->getType());
1765 if (V1->getType() != VTy)
1766 return false;
1767 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1768 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1769 unsigned ReductionOpc =
1770 getArithmeticReductionInstruction(II0.getIntrinsicID());
1771
1772 InstructionCost OldCost = 0;
1773 InstructionCost NewCost = 0;
1774 InstructionCost CostOfRedOperand0 = 0;
1775 InstructionCost CostOfRed0 = 0;
1776 InstructionCost CostOfRedOperand1 = 0;
1777 InstructionCost CostOfRed1 = 0;
1778 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1779 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1780 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1781 NewCost =
1782 CostOfRedOperand0 + CostOfRedOperand1 +
1783 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1784 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1785 if (NewCost >= OldCost || !NewCost.isValid())
1786 return false;
1787
1788 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1789 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1790 << "\n");
1791 Value *VectorBO;
1792 if (BinOpOpc == Instruction::Or)
1793 VectorBO = Builder.CreateOr(V0, V1, "",
1794 cast<PossiblyDisjointInst>(I).isDisjoint());
1795 else
1796 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1797
1798 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1799 replaceValue(I, *Rdx);
1800 return true;
1801}
1802
1803 // Check if the memory location is modified between two instructions in the same BB.
1804 static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1805 BasicBlock::iterator End,
1806 const MemoryLocation &Loc, AAResults &AA) {
1807 unsigned NumScanned = 0;
1808 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1809 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1810 ++NumScanned > MaxInstrsToScan;
1811 });
1812}
1813
1814namespace {
1815 /// Helper class to indicate whether a vector index can be safely scalarized
1816 /// and whether a freeze needs to be inserted.
1817class ScalarizationResult {
1818 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1819
1820 StatusTy Status;
1821 Value *ToFreeze;
1822
1823 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1824 : Status(Status), ToFreeze(ToFreeze) {}
1825
1826public:
1827 ScalarizationResult(const ScalarizationResult &Other) = default;
1828 ~ScalarizationResult() {
1829 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1830 }
1831
1832 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1833 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1834 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1835 return {StatusTy::SafeWithFreeze, ToFreeze};
1836 }
1837
1838 /// Returns true if the index can be scalarized without requiring a freeze.
1839 bool isSafe() const { return Status == StatusTy::Safe; }
1840 /// Returns true if the index cannot be scalarized.
1841 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1842 /// Returns true if the index can be scalarized, but requires inserting a
1843 /// freeze.
1844 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1845
1846 /// Reset the state to Unsafe and clear ToFreeze if set.
1847 void discard() {
1848 ToFreeze = nullptr;
1849 Status = StatusTy::Unsafe;
1850 }
1851
1852 /// Freeze ToFreeze and update the use in \p UserI to use the frozen value.
1853 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1854 assert(isSafeWithFreeze() &&
1855 "should only be used when freezing is required");
1856 assert(is_contained(ToFreeze->users(), &UserI) &&
1857 "UserI must be a user of ToFreeze");
1858 IRBuilder<>::InsertPointGuard Guard(Builder);
1859 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1860 Value *Frozen =
1861 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1862 for (Use &U : make_early_inc_range((UserI.operands())))
1863 if (U.get() == ToFreeze)
1864 U.set(Frozen);
1865
1866 ToFreeze = nullptr;
1867 }
1868};
1869} // namespace
1870
1871/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1872/// Idx. \p Idx must access a valid vector element.
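/// For example (illustrative, not from the tests), with a <4 x i32> vector an
/// index of the form "and i64 %i, 3" is always in range, but %i itself may be
/// poison; such an index is reported as safe-with-freeze and the caller is
/// expected to freeze the base value before scalarizing.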
1873static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1874 Instruction *CtxI,
1875 AssumptionCache &AC,
1876 const DominatorTree &DT) {
1877 // We do checks for both fixed vector types and scalable vector types.
1878 // This is the number of elements of fixed vector types,
1879 // or the minimum number of elements of scalable vector types.
1880 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1881 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1882
1883 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1884 if (C->getValue().ult(NumElements))
1885 return ScalarizationResult::safe();
1886 return ScalarizationResult::unsafe();
1887 }
1888
1889 // Always unsafe if the index type can't handle all inbound values.
1890 if (!llvm::isUIntN(IntWidth, NumElements))
1891 return ScalarizationResult::unsafe();
1892
1893 APInt Zero(IntWidth, 0);
1894 APInt MaxElts(IntWidth, NumElements);
1895 ConstantRange ValidIndices(Zero, MaxElts);
1896 ConstantRange IdxRange(IntWidth, true);
1897
1898 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1899 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1900 true, &AC, CtxI, &DT)))
1901 return ScalarizationResult::safe();
1902 return ScalarizationResult::unsafe();
1903 }
1904
1905 // If the index may be poison, check if we can insert a freeze before the
1906 // range of the index is restricted.
1907 Value *IdxBase;
1908 ConstantInt *CI;
1909 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1910 IdxRange = IdxRange.binaryAnd(CI->getValue());
1911 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1912 IdxRange = IdxRange.urem(CI->getValue());
1913 }
1914
1915 if (ValidIndices.contains(IdxRange))
1916 return ScalarizationResult::safeWithFreeze(IdxBase);
1917 return ScalarizationResult::unsafe();
1918}
1919
1920/// The memory operation on a vector of \p ScalarType had alignment of
1921/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1922/// alignment that will be valid for the memory operation on a single scalar
1923/// element of the same type with index \p Idx.
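/// For example (illustrative numbers), a <4 x i32> access with alignment 16
/// scalarized at constant index 2 touches byte offset 8, so the scalar access
/// may be given alignment 8; with an unknown index only the element store size
/// (here 4) can be assumed.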
1924 static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1925 Type *ScalarType, Value *Idx,
1926 const DataLayout &DL) {
1927 if (auto *C = dyn_cast<ConstantInt>(Idx))
1928 return commonAlignment(VectorAlignment,
1929 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1930 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1931}
1932
1933// Combine patterns like:
1934// %0 = load <4 x i32>, <4 x i32>* %a
1935// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1936// store <4 x i32> %1, <4 x i32>* %a
1937// to:
1938// %0 = bitcast <4 x i32>* %a to i32*
1939// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1940// store i32 %b, i32* %1
1941bool VectorCombine::foldSingleElementStore(Instruction &I) {
1942 if (!TTI.allowVectorElementIndexingUsingGEP())
1943 return false;
1944 auto *SI = cast<StoreInst>(&I);
1945 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1946 return false;
1947
1948 // TODO: Combine more complicated patterns (multiple insert) by referencing
1949 // TargetTransformInfo.
1950 Instruction *Source;
1951 Value *NewElement;
1952 Value *Idx;
1953 if (!match(SI->getValueOperand(),
1954 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1955 m_Value(Idx))))
1956 return false;
1957
1958 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1959 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1960 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1961 // Don't optimize for atomic/volatile loads or stores. Ensure memory is not
1962 // modified in between, the vector type matches the store size, and the index is in bounds.
1963 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1964 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1965 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1966 return false;
1967
1968 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1969 if (ScalarizableIdx.isUnsafe() ||
1970 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1971 MemoryLocation::get(SI), AA))
1972 return false;
1973
1974 // Ensure we add the load back to the worklist BEFORE its users so they can
1975 // be erased in the correct order.
1976 Worklist.push(Load);
1977
1978 if (ScalarizableIdx.isSafeWithFreeze())
1979 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1980 Value *GEP = Builder.CreateInBoundsGEP(
1981 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1982 {ConstantInt::get(Idx->getType(), 0), Idx});
1983 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1984 NSI->copyMetadata(*SI);
1985 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1986 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1987 *DL);
1988 NSI->setAlignment(ScalarOpAlignment);
1989 replaceValue(I, *NSI);
1990 eraseInstruction(I);
1991 return true;
1992 }
1993
1994 return false;
1995}
1996
1997/// Try to scalarize vector loads feeding extractelement or bitcast
1998/// instructions.
1999bool VectorCombine::scalarizeLoad(Instruction &I) {
2000 Value *Ptr;
2001 if (!match(&I, m_Load(m_Value(Ptr))))
2002 return false;
2003
2004 auto *LI = cast<LoadInst>(&I);
2005 auto *VecTy = cast<VectorType>(LI->getType());
2006 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
2007 return false;
2008
2009 bool AllExtracts = true;
2010 bool AllBitcasts = true;
2011 Instruction *LastCheckedInst = LI;
2012 unsigned NumInstChecked = 0;
2013
2014 // Check what type of users we have (must either all be extracts or
2015 // bitcasts) and ensure no memory modifications between the load and
2016 // its users.
2017 for (User *U : LI->users()) {
2018 auto *UI = dyn_cast<Instruction>(U);
2019 if (!UI || UI->getParent() != LI->getParent())
2020 return false;
2021
2022 // If any user is waiting to be erased, then bail out as this will
2023 // distort the cost calculation and possibly lead to infinite loops.
2024 if (UI->use_empty())
2025 return false;
2026
2027 if (!isa<ExtractElementInst>(UI))
2028 AllExtracts = false;
2029 if (!isa<BitCastInst>(UI))
2030 AllBitcasts = false;
2031
2032 // Check if any instruction between the load and the user may modify memory.
2033 if (LastCheckedInst->comesBefore(UI)) {
2034 for (Instruction &I :
2035 make_range(std::next(LI->getIterator()), UI->getIterator())) {
2036 // Bail out if we reached the check limit or the instruction may write
2037 // to memory.
2038 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
2039 return false;
2040 NumInstChecked++;
2041 }
2042 LastCheckedInst = UI;
2043 }
2044 }
2045
2046 if (AllExtracts)
2047 return scalarizeLoadExtract(LI, VecTy, Ptr);
2048 if (AllBitcasts)
2049 return scalarizeLoadBitcast(LI, VecTy, Ptr);
2050 return false;
2051}
2052
2053/// Try to scalarize vector loads feeding extractelement instructions.
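/// For example (a hand-written sketch, not taken from the regression tests):
///   %v = load <4 x i32>, ptr %p, align 16
///   %e = extractelement <4 x i32> %v, i64 2
/// becomes, when every user of the load is such an extract and the cost model
/// agrees:
///   %gep = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 2
///   %e = load i32, ptr %gep, align 8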
2054bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
2055 Value *Ptr) {
2056 if (!TTI.allowVectorElementIndexingUsingGEP())
2057 return false;
2058
2059 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
2060 llvm::scope_exit FailureGuard([&]() {
2061 // If the transform is aborted, discard the ScalarizationResults.
2062 for (auto &Pair : NeedFreeze)
2063 Pair.second.discard();
2064 });
2065
2066 InstructionCost OriginalCost =
2067 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2068 LI->getPointerAddressSpace(), CostKind);
2069 InstructionCost ScalarizedCost = 0;
2070
2071 for (User *U : LI->users()) {
2072 auto *UI = cast<ExtractElementInst>(U);
2073
2074 auto ScalarIdx =
2075 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
2076 if (ScalarIdx.isUnsafe())
2077 return false;
2078 if (ScalarIdx.isSafeWithFreeze()) {
2079 NeedFreeze.try_emplace(UI, ScalarIdx);
2080 ScalarIdx.discard();
2081 }
2082
2083 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
2084 OriginalCost +=
2085 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
2086 Index ? Index->getZExtValue() : -1);
2087 ScalarizedCost +=
2088 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
2089 Align(1), LI->getPointerAddressSpace(), CostKind);
2090 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
2091 nullptr, nullptr, CostKind);
2092 }
2093
2094 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
2095 << "\n LoadExtractCost: " << OriginalCost
2096 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2097
2098 if (ScalarizedCost >= OriginalCost)
2099 return false;
2100
2101 // Ensure we add the load back to the worklist BEFORE its users so they can
2102 // be erased in the correct order.
2103 Worklist.push(LI);
2104
2105 Type *ElemType = VecTy->getElementType();
2106
2107 // Replace extracts with narrow scalar loads.
2108 for (User *U : LI->users()) {
2109 auto *EI = cast<ExtractElementInst>(U);
2110 Value *Idx = EI->getIndexOperand();
2111
2112 // Insert 'freeze' for poison indexes.
2113 auto It = NeedFreeze.find(EI);
2114 if (It != NeedFreeze.end())
2115 It->second.freeze(Builder, *cast<Instruction>(Idx));
2116
2117 Builder.SetInsertPoint(EI);
2118 Value *GEP =
2119 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
2120 auto *NewLoad = cast<LoadInst>(
2121 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
2122
2123 Align ScalarOpAlignment =
2124 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
2125 NewLoad->setAlignment(ScalarOpAlignment);
2126
2127 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
2128 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
2129 AAMDNodes OldAAMD = LI->getAAMetadata();
2130 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
2131 }
2132
2133 replaceValue(*EI, *NewLoad, false);
2134 }
2135
2136 FailureGuard.release();
2137 return true;
2138}
2139
2140/// Try to scalarize vector loads feeding bitcast instructions.
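/// For example (a hand-written sketch, not taken from the regression tests):
///   %v = load <4 x i8>, ptr %p, align 4
///   %i = bitcast <4 x i8> %v to i32
/// becomes, when every user of the load is a bitcast to the same scalar type
/// and the cost model agrees:
///   %i = load i32, ptr %p, align 4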
2141bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
2142 Value *Ptr) {
2143 InstructionCost OriginalCost =
2144 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2145 LI->getPointerAddressSpace(), CostKind);
2146
2147 Type *TargetScalarType = nullptr;
2148 unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
2149
2150 for (User *U : LI->users()) {
2151 auto *BC = cast<BitCastInst>(U);
2152
2153 Type *DestTy = BC->getDestTy();
2154 if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
2155 return false;
2156
2157 unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
2158 if (DestBitWidth != VecBitWidth)
2159 return false;
2160
2161 // All bitcasts must target the same scalar type.
2162 if (!TargetScalarType)
2163 TargetScalarType = DestTy;
2164 else if (TargetScalarType != DestTy)
2165 return false;
2166
2167 OriginalCost +=
2168 TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
2169 TTI::CastContextHint::None, CostKind);
2170 }
2171
2172 if (!TargetScalarType)
2173 return false;
2174
2175 assert(!LI->user_empty() && "Unexpected load without bitcast users");
2176 InstructionCost ScalarizedCost =
2177 TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
2178 LI->getPointerAddressSpace(), CostKind);
2179
2180 LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2181 << "\n OriginalCost: " << OriginalCost
2182 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2183
2184 if (ScalarizedCost >= OriginalCost)
2185 return false;
2186
2187 // Ensure we add the load back to the worklist BEFORE its users so they can
2188 // be erased in the correct order.
2189 Worklist.push(LI);
2190
2191 Builder.SetInsertPoint(LI);
2192 auto *ScalarLoad =
2193 Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
2194 ScalarLoad->setAlignment(LI->getAlign());
2195 ScalarLoad->copyMetadata(*LI);
2196
2197 // Replace all bitcast users with the scalar load.
2198 for (User *U : LI->users()) {
2199 auto *BC = cast<BitCastInst>(U);
2200 replaceValue(*BC, *ScalarLoad, false);
2201 }
2202
2203 return true;
2204}
2205
2206bool VectorCombine::scalarizeExtExtract(Instruction &I) {
2208 return false;
2209 auto *Ext = dyn_cast<ZExtInst>(&I);
2210 if (!Ext)
2211 return false;
2212
2213 // Try to convert a vector zext feeding only extracts to a set of scalar
2214 // shifts and masks, i.e. (Src >> (ExtIdx * EltBits)) & EltMask,
2215 // if profitable.
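  // For example (a hand-written sketch, not taken from the regression tests),
  // on a little-endian target:
  //   %z = zext <4 x i8> %src to <4 x i32>
  //   %e1 = extractelement <4 x i32> %z, i64 1
  // can become:
  //   %s = bitcast <4 x i8> %src to i32
  //   %sh = lshr i32 %s, 8
  //   %e1 = and i32 %sh, 255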
2216 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2217 if (!SrcTy)
2218 return false;
2219 auto *DstTy = cast<FixedVectorType>(Ext->getType());
2220
2221 Type *ScalarDstTy = DstTy->getElementType();
2222 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2223 return false;
2224
2225 InstructionCost VectorCost =
2226 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2227 TTI::CastContextHint::None, CostKind);
2228 unsigned ExtCnt = 0;
2229 bool ExtLane0 = false;
2230 for (User *U : Ext->users()) {
2231 uint64_t Idx;
2232 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2233 return false;
2234 if (cast<Instruction>(U)->use_empty())
2235 continue;
2236 ExtCnt += 1;
2237 ExtLane0 |= !Idx;
2238 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2239 CostKind, Idx, U);
2240 }
2241
2242 InstructionCost ScalarCost =
2243 ExtCnt * TTI.getArithmeticInstrCost(
2244 Instruction::And, ScalarDstTy, CostKind,
2245 {TTI::OK_AnyValue, TTI::OP_None},
2246 {TTI::OK_NonUniformConstantValue, TTI::OP_None}) +
2247 (ExtCnt - ExtLane0) *
2248 TTI.getArithmeticInstrCost(
2249 Instruction::LShr, ScalarDstTy, CostKind,
2250 {TTI::OK_AnyValue, TTI::OP_None},
2251 {TTI::OK_NonUniformConstantValue, TTI::OP_None});
2252 if (ScalarCost > VectorCost)
2253 return false;
2254
2255 Value *ScalarV = Ext->getOperand(0);
2256 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2257 &DT)) {
2258 // Check whether all lanes are extracted, all extracts trigger UB
2259 // on poison, and the last extract (and hence all previous ones)
2260 // are guaranteed to execute if Ext executes. If so, we do not
2261 // need to insert a freeze.
2262 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2263 bool AllExtractsTriggerUB = true;
2264 ExtractElementInst *LastExtract = nullptr;
2265 BasicBlock *ExtBB = Ext->getParent();
2266 for (User *U : Ext->users()) {
2267 auto *Extract = cast<ExtractElementInst>(U);
2268 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2269 AllExtractsTriggerUB = false;
2270 break;
2271 }
2272 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2273 if (!LastExtract || LastExtract->comesBefore(Extract))
2274 LastExtract = Extract;
2275 }
2276 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2277 !AllExtractsTriggerUB ||
2278 !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(),
2279 LastExtract->getIterator()))
2280 ScalarV = Builder.CreateFreeze(ScalarV);
2281 }
2282 ScalarV = Builder.CreateBitCast(
2283 ScalarV,
2284 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2285 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2286 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2287 APInt EltBitMask = APInt::getLowBitsSet(TotalBits, SrcEltSizeInBits);
2288 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2289 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2290 for (User *U : Ext->users()) {
2291 auto *Extract = cast<ExtractElementInst>(U);
2292 uint64_t Idx =
2293 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2294 uint64_t ShiftAmt =
2295 DL->isBigEndian()
2296 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2297 : (Idx * SrcEltSizeInBits);
2298 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2299 Value *And = Builder.CreateAnd(LShr, Mask);
2300 U->replaceAllUsesWith(And);
2301 }
2302 return true;
2303}
2304
2305/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2306/// to "(bitcast (concat X, Y))"
2307/// where X/Y are bitcasted from i1 mask vectors.
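/// For example (a hand-written sketch, not taken from the regression tests):
///   %ix = bitcast <8 x i1> %m0 to i8
///   %iy = bitcast <8 x i1> %m1 to i8
///   %zx = zext i8 %ix to i16
///   %zy = zext i8 %iy to i16
///   %sh = shl i16 %zy, 8
///   %r = or disjoint i16 %zx, %sh
/// becomes, when the cost model agrees:
///   %cc = shufflevector <8 x i1> %m0, <8 x i1> %m1,
///                       <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5,
///                                   i32 6, i32 7, i32 8, i32 9, i32 10,
///                                   i32 11, i32 12, i32 13, i32 14, i32 15>
///   %r = bitcast <16 x i1> %cc to i16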
2308bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2309 Type *Ty = I.getType();
2310 if (!Ty->isIntegerTy())
2311 return false;
2312
2313 // TODO: Add big endian test coverage
2314 if (DL->isBigEndian())
2315 return false;
2316
2317 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2318 Instruction *X, *Y;
2319 if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y))))
2320 return false;
2321
2322 // Allow both sources to contain shl, to handle more generic pattern:
2323 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2324 Value *SrcX;
2325 uint64_t ShAmtX = 0;
2326 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2327 !match(X, m_OneUse(
2328 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
2329 m_ConstantInt(ShAmtX)))))
2330 return false;
2331
2332 Value *SrcY;
2333 uint64_t ShAmtY = 0;
2334 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2335 !match(Y, m_OneUse(
2336 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
2337 m_ConstantInt(ShAmtY)))))
2338 return false;
2339
2340 // Canonicalize larger shift to the RHS.
2341 if (ShAmtX > ShAmtY) {
2342 std::swap(X, Y);
2343 std::swap(SrcX, SrcY);
2344 std::swap(ShAmtX, ShAmtY);
2345 }
2346
2347 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2348 // difference is the mask width so they can be easily concatenated together.
2349 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2350 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2351 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2352 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2353 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2354 !MaskTy->getElementType()->isIntegerTy(1) ||
2355 MaskTy->getNumElements() != ShAmtDiff ||
2356 MaskTy->getNumElements() > (BitWidth / 2))
2357 return false;
2358
2359 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2360 auto *ConcatIntTy =
2361 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2362 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2363
2364 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2365 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2366
2367 // TODO: Is it worth supporting multi use cases?
2368 InstructionCost OldCost = 0;
2369 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2370 OldCost +=
2371 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2372 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2373 TTI::CastContextHint::None, CostKind);
2374 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2375 TTI::CastContextHint::None, CostKind);
2376
2377 InstructionCost NewCost = 0;
2378 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ConcatTy,
2379 MaskTy, ConcatMask, CostKind);
2380 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2381 TTI::CastContextHint::None, CostKind);
2382 if (Ty != ConcatIntTy)
2383 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2384 TTI::CastContextHint::None, CostKind);
2385 if (ShAmtX > 0)
2386 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2387
2388 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2389 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2390 << "\n");
2391
2392 if (NewCost > OldCost)
2393 return false;
2394
2395 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2396 // any residual zero-extension or shifting.
2397 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2398 Worklist.pushValue(Concat);
2399
2400 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2401
2402 if (Ty != ConcatIntTy) {
2403 Worklist.pushValue(Result);
2404 Result = Builder.CreateZExt(Result, Ty);
2405 }
2406
2407 if (ShAmtX > 0) {
2408 Worklist.pushValue(Result);
2409 Result = Builder.CreateShl(Result, ShAmtX);
2410 }
2411
2412 replaceValue(I, *Result);
2413 return true;
2414}
2415
2416/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2417/// --> "binop (shuffle), (shuffle)".
2418bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2419 BinaryOperator *BinOp;
2420 ArrayRef<int> OuterMask;
2421 if (!match(&I, m_Shuffle(m_BinOp(BinOp), m_Undef(), m_Mask(OuterMask))))
2422 return false;
2423
2424 // Don't introduce poison into div/rem.
2425 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2426 return false;
2427
2428 Value *Op00, *Op01, *Op10, *Op11;
2429 ArrayRef<int> Mask0, Mask1;
2430 bool Match0 = match(BinOp->getOperand(0),
2431 m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)));
2432 bool Match1 = match(BinOp->getOperand(1),
2433 m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)));
2434 if (!Match0 && !Match1)
2435 return false;
2436
2437 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2438 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2439 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2440 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2441
2442 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2443 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2444 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2445 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2446 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2447 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2448 return false;
2449
2450 unsigned NumSrcElts = BinOpTy->getNumElements();
2451
2452 // Don't accept shuffles that reference the second operand in
2453 // div/rem or if it's an undef arg.
2454 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2455 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2456 return false;
2457
2458 // Merge outer / inner (or identity if no match) shuffles.
2459 SmallVector<int> NewMask0, NewMask1;
2460 for (int M : OuterMask) {
2461 if (M < 0 || M >= (int)NumSrcElts) {
2462 NewMask0.push_back(PoisonMaskElem);
2463 NewMask1.push_back(PoisonMaskElem);
2464 } else {
2465 NewMask0.push_back(Match0 ? Mask0[M] : M);
2466 NewMask1.push_back(Match1 ? Mask1[M] : M);
2467 }
2468 }
2469
2470 unsigned NumOpElts = Op0Ty->getNumElements();
2471 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2472 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2473 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2474 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2475 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2476 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2477
2478 InstructionCost NewCost = 0;
2479 // Try to merge shuffles across the binop if the new shuffles are not costly.
2480 InstructionCost BinOpCost =
2481 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind);
2482 InstructionCost OldCost =
2483 BinOpCost + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
2484 ShuffleDstTy, BinOpTy, OuterMask, CostKind,
2485 0, nullptr, {BinOp}, &I);
2486 if (!BinOp->hasOneUse())
2487 NewCost += BinOpCost;
2488
2489 if (Match0) {
2490 InstructionCost Shuf0Cost = TTI.getShuffleCost(
2491 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2492 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2493 OldCost += Shuf0Cost;
2494 if (!BinOp->hasOneUse() || !BinOp->getOperand(0)->hasOneUse())
2495 NewCost += Shuf0Cost;
2496 }
2497 if (Match1) {
2498 InstructionCost Shuf1Cost = TTI.getShuffleCost(
2499 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2500 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2501 OldCost += Shuf1Cost;
2502 if (!BinOp->hasOneUse() || !BinOp->getOperand(1)->hasOneUse())
2503 NewCost += Shuf1Cost;
2504 }
2505
2506 NewCost += TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2507
2508 if (!IsIdentity0)
2509 NewCost +=
2510 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2511 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2512 if (!IsIdentity1)
2513 NewCost +=
2514 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2515 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2516
2517 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2518 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2519 << "\n");
2520
2521 // If costs are equal, still fold as we reduce instruction count.
2522 if (NewCost > OldCost)
2523 return false;
2524
2525 Value *LHS =
2526 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2527 Value *RHS =
2528 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2529 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2530
2531 // Intersect flags from the old binops.
2532 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2533 NewInst->copyIRFlags(BinOp);
2534
2535 Worklist.pushValue(LHS);
2536 Worklist.pushValue(RHS);
2537 replaceValue(I, *NewBO);
2538 return true;
2539}
2540
2541/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2542/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
2543bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2544 ArrayRef<int> OldMask;
2545 Instruction *LHS, *RHS;
2546 if (!match(&I, m_Shuffle(m_Instruction(LHS), m_Instruction(RHS),
2547 m_Mask(OldMask))))
2548 return false;
2549
2550 // TODO: Add support for addlike etc.
2551 if (LHS->getOpcode() != RHS->getOpcode())
2552 return false;
2553
2554 Value *X, *Y, *Z, *W;
2555 bool IsCommutative = false;
2556 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2557 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2558 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2559 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2560 auto *BO = cast<BinaryOperator>(LHS);
2561 // Don't introduce poison into div/rem.
2562 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2563 return false;
2564 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2565 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2566 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2567 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2568 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2569 } else
2570 return false;
2571
2572 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2573 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2574 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2575 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2576 return false;
2577
2578 bool SameBinOp = LHS == RHS;
2579 unsigned NumSrcElts = BinOpTy->getNumElements();
2580
2581 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2582 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2583 std::swap(X, Y);
2584
2585 auto ConvertToUnary = [NumSrcElts](int &M) {
2586 if (M >= (int)NumSrcElts)
2587 M -= NumSrcElts;
2588 };
2589
2590 SmallVector<int> NewMask0(OldMask);
2591 TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
2592 TTI::OperandValueInfo Op0Info = TTI.commonOperandInfo(X, Z);
2593 if (X == Z) {
2594 llvm::for_each(NewMask0, ConvertToUnary);
2595 SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
2596 Z = PoisonValue::get(BinOpTy);
2597 }
2598
2599 SmallVector<int> NewMask1(OldMask);
2600 TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
2601 TTI::OperandValueInfo Op1Info = TTI.commonOperandInfo(Y, W);
2602 if (Y == W) {
2603 llvm::for_each(NewMask1, ConvertToUnary);
2604 SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
2605 W = PoisonValue::get(BinOpTy);
2606 }
2607
2608 // Try to replace a binop with a shuffle if the shuffle is not costly.
2609 // When SameBinOp, only count the binop cost once.
2610 InstructionCost LHSCost = TTI.getInstructionCost(LHS, CostKind);
2611 InstructionCost RHSCost = TTI.getInstructionCost(RHS, CostKind);
2612
2613 InstructionCost OldCost = LHSCost;
2614 if (!SameBinOp) {
2615 OldCost += RHSCost;
2616 }
2617 OldCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
2618 ShuffleDstTy, BinResTy, OldMask, CostKind, 0,
2619 nullptr, {LHS, RHS}, &I);
2620
2621 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2622 // where one-use shuffles have gotten split across the binop/cmp. These
2623 // often allow a major reduction in total cost that wouldn't happen as
2624 // individual folds.
2625 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2626 TTI::TargetCostKind CostKind) -> bool {
2627 Value *InnerOp;
2628 ArrayRef<int> InnerMask;
2629 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2630 m_Mask(InnerMask)))) &&
2631 InnerOp->getType() == Op->getType() &&
2632 all_of(InnerMask,
2633 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2634 for (int &M : Mask)
2635 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2636 M = InnerMask[M - Offset];
2637 M = 0 <= M ? M + Offset : M;
2638 }
2639 OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind);
2640 Op = InnerOp;
2641 return true;
2642 }
2643 return false;
2644 };
2645 bool ReducedInstCount = false;
2646 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2647 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2648 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2649 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2650 bool SingleSrcBinOp = (X == Y) && (Z == W) && (NewMask0 == NewMask1);
2651 // SingleSrcBinOp only reduces instruction count if we also eliminate the
2652 // original binop(s). If binops have multiple uses, they won't be eliminated.
2653 ReducedInstCount |= SingleSrcBinOp && LHS->hasOneUser() && RHS->hasOneUser();
2654
2655 auto *ShuffleCmpTy =
2656 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2657 InstructionCost NewCost = TTI.getShuffleCost(
2658 SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z});
2659 if (!SingleSrcBinOp)
2660 NewCost += TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1,
2661 CostKind, 0, nullptr, {Y, W});
2662
2663 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2664 NewCost += TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy,
2665 CostKind, Op0Info, Op1Info);
2666 } else {
2667 NewCost +=
2668 TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy, ShuffleDstTy,
2669 PredLHS, CostKind, Op0Info, Op1Info);
2670 }
2671 // If LHS/RHS have other uses, we need to account for the cost of keeping
2672 // the original instructions. When SameBinOp, only add the cost once.
2673 if (!LHS->hasOneUser())
2674 NewCost += LHSCost;
2675 if (!SameBinOp && !RHS->hasOneUser())
2676 NewCost += RHSCost;
2677
2678 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2679 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2680 << "\n");
2681
2682 // If either shuffle will constant fold away, then fold for the same cost as
2683 // we will reduce the instruction count.
2684 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2685 (isa<Constant>(Y) && isa<Constant>(W));
2686 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2687 return false;
2688
2689 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2690 Value *Shuf1 =
2691 SingleSrcBinOp ? Shuf0 : Builder.CreateShuffleVector(Y, W, NewMask1);
2692 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2693 ? Builder.CreateBinOp(
2694 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2695 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2696
2697 // Intersect flags from the old binops.
2698 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2699 NewInst->copyIRFlags(LHS);
2700 NewInst->andIRFlags(RHS);
2701 }
2702
2703 Worklist.pushValue(Shuf0);
2704 Worklist.pushValue(Shuf1);
2705 replaceValue(I, *NewBO);
2706 return true;
2707}
2708
2709/// Try to convert,
2710/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2711/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
2712bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2713 ArrayRef<int> Mask;
2714 Value *C1, *T1, *F1, *C2, *T2, *F2;
2715 if (!match(&I, m_Shuffle(m_Select(m_Value(C1), m_Value(T1), m_Value(F1)),
2716 m_Select(m_Value(C2), m_Value(T2), m_Value(F2)),
2717 m_Mask(Mask))))
2718 return false;
2719
2720 auto *Sel1 = cast<Instruction>(I.getOperand(0));
2721 auto *Sel2 = cast<Instruction>(I.getOperand(1));
2722
2723 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2724 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2725 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2726 return false;
2727
2728 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2729 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2730 // SelectInsts must have the same FMF.
2731 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2732 ((SI0FOp != nullptr) &&
2733 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2734 return false;
2735
2736 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2737 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2738 auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
2739 auto SelOp = Instruction::Select;
2740
2741 InstructionCost CostSel1 = TTI.getCmpSelInstrCost(
2742 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2743 InstructionCost CostSel2 = TTI.getCmpSelInstrCost(
2744 SelOp, SrcVecTy, C2VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2745
2746 InstructionCost OldCost =
2747 CostSel1 + CostSel2 +
2748 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2749 {I.getOperand(0), I.getOperand(1)}, &I);
2750
2751 InstructionCost NewCost = TTI.getShuffleCost(
2752 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2753 Mask, CostKind, 0, nullptr, {C1, C2});
2754 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2755 nullptr, {T1, T2});
2756 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2757 nullptr, {F1, F2});
2758 auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2759 toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2760 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2761 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2762
2763 if (!Sel1->hasOneUse())
2764 NewCost += CostSel1;
2765 if (!Sel2->hasOneUse())
2766 NewCost += CostSel2;
2767
2768 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2769 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2770 << "\n");
2771 if (NewCost > OldCost)
2772 return false;
2773
2774 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2775 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2776 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2777 Value *NewSel;
2778 // We presuppose that the SelectInsts have the same FMF.
2779 if (SI0FOp)
2780 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2781 SI0FOp->getFastMathFlags());
2782 else
2783 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2784
2785 Worklist.pushValue(ShuffleCmp);
2786 Worklist.pushValue(ShuffleTrue);
2787 Worklist.pushValue(ShuffleFalse);
2788 replaceValue(I, *NewSel);
2789 return true;
2790}
2791
2792/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2793/// into "castop (shuffle)".
2794bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2795 Value *V0, *V1;
2796 ArrayRef<int> OldMask;
2797 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2798 return false;
2799
2800 // Check whether this is a binary shuffle.
2801 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2802
2803 auto *C0 = dyn_cast<CastInst>(V0);
2804 auto *C1 = dyn_cast<CastInst>(V1);
2805 if (!C0 || (IsBinaryShuffle && !C1))
2806 return false;
2807
2808 Instruction::CastOps Opcode = C0->getOpcode();
2809
2810 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2811 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2812 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2813 return false;
2814
2815 if (IsBinaryShuffle) {
2816 if (C0->getSrcTy() != C1->getSrcTy())
2817 return false;
2818 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2819 if (Opcode != C1->getOpcode()) {
2820 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2821 Opcode = Instruction::SExt;
2822 else
2823 return false;
2824 }
2825 }
2826
2827 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2828 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2829 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2830 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2831 return false;
2832
2833 unsigned NumSrcElts = CastSrcTy->getNumElements();
2834 unsigned NumDstElts = CastDstTy->getNumElements();
2835 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2836 "Only bitcasts expected to alter src/dst element counts");
2837
2838 // Check for bitcasts between vector types whose element counts do not
2839 // evenly divide each other, e.g. <32 x i40> -> <40 x i32>.
2840 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2841 (NumDstElts % NumSrcElts) != 0)
2842 return false;
2843
2844 SmallVector<int, 16> NewMask;
2845 if (NumSrcElts >= NumDstElts) {
2846 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2847 // always be expanded to the equivalent form choosing narrower elements.
2848 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2849 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2850 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2851 } else {
2852 // The bitcast is from narrow elements to wide elements. The shuffle mask
2853 // must choose consecutive elements to allow casting first.
2854 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2855 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2856 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2857 return false;
2858 }
2859
2860 auto *NewShuffleDstTy =
2861 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2862
2863 // Try to replace a castop with a shuffle if the shuffle is not costly.
2864 InstructionCost CostC0 =
2865 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2866 TTI::CastContextHint::None, CostKind);
2867
2868 TargetTransformInfo::ShuffleKind ShuffleKind;
2869 if (IsBinaryShuffle)
2870 ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc;
2871 else
2872 ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc;
2873
2874 InstructionCost OldCost = CostC0;
2875 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2876 CostKind, 0, nullptr, {}, &I);
2877
2878 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2879 CastSrcTy, NewMask, CostKind);
2880 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
2881 TTI::CastContextHint::None, CostKind);
2882 if (!C0->hasOneUse())
2883 NewCost += CostC0;
2884 if (IsBinaryShuffle) {
2885 InstructionCost CostC1 =
2886 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2887 TTI::CastContextHint::None, CostKind);
2888 OldCost += CostC1;
2889 if (!C1->hasOneUse())
2890 NewCost += CostC1;
2891 }
2892
2893 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2894 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2895 << "\n");
2896 if (NewCost > OldCost)
2897 return false;
2898
2899 Value *Shuf;
2900 if (IsBinaryShuffle)
2901 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2902 NewMask);
2903 else
2904 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2905
2906 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2907
2908 // Intersect flags from the old casts.
2909 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2910 NewInst->copyIRFlags(C0);
2911 if (IsBinaryShuffle)
2912 NewInst->andIRFlags(C1);
2913 }
2914
2915 Worklist.pushValue(Shuf);
2916 replaceValue(I, *Cast);
2917 return true;
2918}
2919
2920/// Try to convert any of:
2921/// "shuffle (shuffle x, y), (shuffle y, x)"
2922/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2923/// "shuffle (shuffle x, undef), y"
2924/// "shuffle x, (shuffle y, undef)"
2925/// into "shuffle x, y".
2926bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2927 ArrayRef<int> OuterMask;
2928 Value *OuterV0, *OuterV1;
2929 if (!match(&I,
2930 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2931 return false;
2932
2933 ArrayRef<int> InnerMask0, InnerMask1;
2934 Value *X0, *X1, *Y0, *Y1;
2935 bool Match0 =
2936 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2937 bool Match1 =
2938 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2939 if (!Match0 && !Match1)
2940 return false;
2941
2942 // If the outer shuffle is a permute, then create a fake inner all-poison
2943 // shuffle. This is easier than accounting for length-changing shuffles below.
2944 SmallVector<int, 16> PoisonMask1;
2945 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2946 X1 = X0;
2947 Y1 = Y0;
2948 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2949 InnerMask1 = PoisonMask1;
2950 Match1 = true; // fake match
2951 }
2952
2953 X0 = Match0 ? X0 : OuterV0;
2954 Y0 = Match0 ? Y0 : OuterV0;
2955 X1 = Match1 ? X1 : OuterV1;
2956 Y1 = Match1 ? Y1 : OuterV1;
2957 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2958 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2959 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2960 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2961 X0->getType() != X1->getType())
2962 return false;
2963
2964 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2965 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2966
2967 // Attempt to merge shuffles, matching up to 2 source operands.
2968 // Replace index to a poison arg with PoisonMaskElem.
2969 // Bail if either inner masks reference an undef arg.
2970 SmallVector<int, 16> NewMask(OuterMask);
2971 Value *NewX = nullptr, *NewY = nullptr;
2972 for (int &M : NewMask) {
2973 Value *Src = nullptr;
2974 if (0 <= M && M < (int)NumImmElts) {
2975 Src = OuterV0;
2976 if (Match0) {
2977 M = InnerMask0[M];
2978 Src = M >= (int)NumSrcElts ? Y0 : X0;
2979 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2980 }
2981 } else if (M >= (int)NumImmElts) {
2982 Src = OuterV1;
2983 M -= NumImmElts;
2984 if (Match1) {
2985 M = InnerMask1[M];
2986 Src = M >= (int)NumSrcElts ? Y1 : X1;
2987 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2988 }
2989 }
2990 if (Src && M != PoisonMaskElem) {
2991 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2992 if (isa<UndefValue>(Src)) {
2993 // We've referenced an undef element - if it's poison, update the shuffle
2994 // mask, else bail.
2995 if (!isa<PoisonValue>(Src))
2996 return false;
2997 M = PoisonMaskElem;
2998 continue;
2999 }
3000 if (!NewX || NewX == Src) {
3001 NewX = Src;
3002 continue;
3003 }
3004 if (!NewY || NewY == Src) {
3005 M += NumSrcElts;
3006 NewY = Src;
3007 continue;
3008 }
3009 return false;
3010 }
3011 }
3012
3013 if (!NewX)
3014 return PoisonValue::get(ShuffleDstTy);
3015 if (!NewY)
3016 NewY = PoisonValue::get(ShuffleSrcTy);
3017
3018 // Have we folded to an Identity shuffle?
3019 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
3020 replaceValue(I, *NewX);
3021 return true;
3022 }
3023
3024 // Try to merge the shuffles if the new shuffle is not costly.
3025 InstructionCost InnerCost0 = 0;
3026 if (Match0)
3027 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
3028
3029 InstructionCost InnerCost1 = 0;
3030 if (Match1)
3031 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
3032
3033 InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind);
3034
3035 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
3036
3037 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
3041 InstructionCost NewCost =
3042 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
3043 nullptr, {NewX, NewY});
3044 if (!OuterV0->hasOneUse())
3045 NewCost += InnerCost0;
3046 if (!OuterV1->hasOneUse())
3047 NewCost += InnerCost1;
3048
3049 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
3050 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3051 << "\n");
3052 if (NewCost > OldCost)
3053 return false;
3054
3055 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
3056 replaceValue(I, *Shuf);
3057 return true;
3058}
3059
3060/// Try to convert a chain of length-preserving shuffles that are fed by
3061/// length-changing shuffles from the same source, e.g. a chain of length 3:
3062///
3063/// "shuffle (shuffle (shuffle x, (shuffle y, undef)),
3064/// (shuffle y, undef)),
 3065/// (shuffle y, undef)"
3066///
3067/// into a single shuffle fed by a length-changing shuffle:
3068///
3069/// "shuffle x, (shuffle y, undef)"
3070///
3071/// Such chains arise e.g. from folding extract/insert sequences.
3072bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
3073 FixedVectorType *TrunkType = dyn_cast<FixedVectorType>(I.getType());
3074 if (!TrunkType)
3075 return false;
3076
3077 unsigned ChainLength = 0;
3078 SmallVector<int> Mask;
3079 SmallVector<int> YMask;
3080 InstructionCost OldCost = 0;
3081 InstructionCost NewCost = 0;
3082 Value *Trunk = &I;
3083 unsigned NumTrunkElts = TrunkType->getNumElements();
3084 Value *Y = nullptr;
3085
3086 for (;;) {
3087 // Match the current trunk against (commutations of) the pattern
3088 // "shuffle trunk', (shuffle y, undef)"
3089 ArrayRef<int> OuterMask;
3090 Value *OuterV0, *OuterV1;
3091 if (ChainLength != 0 && !Trunk->hasOneUse())
3092 break;
3093 if (!match(Trunk, m_Shuffle(m_Value(OuterV0), m_Value(OuterV1),
3094 m_Mask(OuterMask))))
3095 break;
3096 if (OuterV0->getType() != TrunkType) {
3097 // This shuffle is not length-preserving, so it cannot be part of the
3098 // chain.
3099 break;
3100 }
3101
3102 ArrayRef<int> InnerMask0, InnerMask1;
3103 Value *A0, *A1, *B0, *B1;
3104 bool Match0 =
3105 match(OuterV0, m_Shuffle(m_Value(A0), m_Value(B0), m_Mask(InnerMask0)));
3106 bool Match1 =
3107 match(OuterV1, m_Shuffle(m_Value(A1), m_Value(B1), m_Mask(InnerMask1)));
3108 bool Match0Leaf = Match0 && A0->getType() != I.getType();
3109 bool Match1Leaf = Match1 && A1->getType() != I.getType();
3110 if (Match0Leaf == Match1Leaf) {
3111 // Only handle the case of exactly one leaf in each step. The "two leaves"
3112 // case is handled by foldShuffleOfShuffles.
3113 break;
3114 }
3115
3116 SmallVector<int> CommutedOuterMask;
3117 if (Match0Leaf) {
3118 std::swap(OuterV0, OuterV1);
3119 std::swap(InnerMask0, InnerMask1);
3120 std::swap(A0, A1);
3121 std::swap(B0, B1);
3122 llvm::append_range(CommutedOuterMask, OuterMask);
3123 for (int &M : CommutedOuterMask) {
3124 if (M == PoisonMaskElem)
3125 continue;
3126 if (M < (int)NumTrunkElts)
3127 M += NumTrunkElts;
3128 else
3129 M -= NumTrunkElts;
3130 }
3131 OuterMask = CommutedOuterMask;
3132 }
3133 if (!OuterV1->hasOneUse())
3134 break;
3135
3136 if (!isa<UndefValue>(A1)) {
3137 if (!Y)
3138 Y = A1;
3139 else if (Y != A1)
3140 break;
3141 }
3142 if (!isa<UndefValue>(B1)) {
3143 if (!Y)
3144 Y = B1;
3145 else if (Y != B1)
3146 break;
3147 }
3148
3149 auto *YType = cast<FixedVectorType>(A1->getType());
3150 int NumLeafElts = YType->getNumElements();
3151 SmallVector<int> LocalYMask(InnerMask1);
3152 for (int &M : LocalYMask) {
3153 if (M >= NumLeafElts)
3154 M -= NumLeafElts;
3155 }
3156
3157 InstructionCost LocalOldCost =
3160
3161 // Handle the initial (start of chain) case.
3162 if (!ChainLength) {
3163 Mask.assign(OuterMask);
3164 YMask.assign(LocalYMask);
3165 OldCost = NewCost = LocalOldCost;
3166 Trunk = OuterV0;
3167 ChainLength++;
3168 continue;
3169 }
3170
3171 // For the non-root case, first attempt to combine masks.
3172 SmallVector<int> NewYMask(YMask);
3173 bool Valid = true;
3174 for (auto [CombinedM, LeafM] : llvm::zip(NewYMask, LocalYMask)) {
3175 if (LeafM == -1 || CombinedM == LeafM)
3176 continue;
3177 if (CombinedM == -1) {
3178 CombinedM = LeafM;
3179 } else {
3180 Valid = false;
3181 break;
3182 }
3183 }
3184 if (!Valid)
3185 break;
3186
3187 SmallVector<int> NewMask;
3188 NewMask.reserve(NumTrunkElts);
3189 for (int M : Mask) {
3190 if (M < 0 || M >= static_cast<int>(NumTrunkElts))
3191 NewMask.push_back(M);
3192 else
3193 NewMask.push_back(OuterMask[M]);
3194 }
3195
3196 // Break the chain if adding this new step complicates the shuffles such
3197 // that it would increase the new cost by more than the old cost of this
3198 // step.
3199 InstructionCost LocalNewCost =
3201 YType, NewYMask, CostKind) +
3203 TrunkType, NewMask, CostKind);
3204
3205 if (LocalNewCost >= NewCost && LocalOldCost < LocalNewCost - NewCost)
3206 break;
3207
3208 LLVM_DEBUG({
3209 if (ChainLength == 1) {
3210 dbgs() << "Found chain of shuffles fed by length-changing shuffles: "
3211 << I << '\n';
3212 }
3213 dbgs() << " next chain link: " << *Trunk << '\n'
3214 << " old cost: " << (OldCost + LocalOldCost)
3215 << " new cost: " << LocalNewCost << '\n';
3216 });
3217
3218 Mask = NewMask;
3219 YMask = NewYMask;
3220 OldCost += LocalOldCost;
3221 NewCost = LocalNewCost;
3222 Trunk = OuterV0;
3223 ChainLength++;
3224 }
3225 if (ChainLength <= 1)
3226 return false;
3227
3228 if (llvm::all_of(Mask, [&](int M) {
3229 return M < 0 || M >= static_cast<int>(NumTrunkElts);
3230 })) {
3231 // Produce a canonical simplified form if all elements are sourced from Y.
3232 for (int &M : Mask) {
3233 if (M >= static_cast<int>(NumTrunkElts))
3234 M = YMask[M - NumTrunkElts];
3235 }
3236 Value *Root =
3237 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), Mask);
3238 replaceValue(I, *Root);
3239 return true;
3240 }
3241
3242 Value *Leaf =
3243 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), YMask);
3244 Value *Root = Builder.CreateShuffleVector(Trunk, Leaf, Mask);
3245 replaceValue(I, *Root);
3246 return true;
3247}
3248
3249/// Try to convert
3250/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
3251bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
3252 Value *V0, *V1;
3253 ArrayRef<int> OldMask;
3254 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
3255 return false;
3256
3257 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3258 auto *II1 = dyn_cast<IntrinsicInst>(V1);
3259 if (!II0 || !II1)
3260 return false;
3261
3262 Intrinsic::ID IID = II0->getIntrinsicID();
3263 if (IID != II1->getIntrinsicID())
3264 return false;
3265 InstructionCost CostII0 =
3266 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3267 InstructionCost CostII1 =
3268 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind);
3269
3270 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3271 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
3272 if (!ShuffleDstTy || !II0Ty)
3273 return false;
3274
3275 if (!isTriviallyVectorizable(IID))
3276 return false;
3277
3278 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3280 II0->getArgOperand(I) != II1->getArgOperand(I))
3281 return false;
3282
3283 InstructionCost OldCost =
3284 CostII0 + CostII1 +
3286 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
3287
3288 SmallVector<Type *> NewArgsTy;
3289 InstructionCost NewCost = 0;
3290 SmallDenseSet<std::pair<Value *, Value *>> SeenOperandPairs;
3291 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3293 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3294 } else {
3295 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3296 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3297 ShuffleDstTy->getNumElements());
3298 NewArgsTy.push_back(ArgTy);
3299 std::pair<Value *, Value *> OperandPair =
3300 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3301 if (!SeenOperandPairs.insert(OperandPair).second) {
3302 // We've already computed the cost for this operand pair.
3303 continue;
3304 }
3305 NewCost += TTI.getShuffleCost(
3306 TargetTransformInfo::SK_PermuteTwoSrc, ArgTy, VecTy, OldMask,
3307 CostKind, 0, nullptr, {II0->getArgOperand(I), II1->getArgOperand(I)});
3308 }
3309 }
3310 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3311
3312 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3313 if (!II0->hasOneUse())
3314 NewCost += CostII0;
3315 if (II1 != II0 && !II1->hasOneUse())
3316 NewCost += CostII1;
3317
3318 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
3319 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3320 << "\n");
3321
3322 if (NewCost > OldCost)
3323 return false;
3324
3325 SmallVector<Value *> NewArgs;
3326 SmallDenseMap<std::pair<Value *, Value *>, Value *> ShuffleCache;
3327 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3329 NewArgs.push_back(II0->getArgOperand(I));
3330 } else {
3331 std::pair<Value *, Value *> OperandPair =
3332 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3333 auto It = ShuffleCache.find(OperandPair);
3334 if (It != ShuffleCache.end()) {
3335 // Reuse previously created shuffle for this operand pair.
3336 NewArgs.push_back(It->second);
3337 continue;
3338 }
3339 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
3340 II1->getArgOperand(I), OldMask);
3341 ShuffleCache[OperandPair] = Shuf;
3342 NewArgs.push_back(Shuf);
3343 Worklist.pushValue(Shuf);
3344 }
3345 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3346
3347 // Intersect flags from the old intrinsics.
3348 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
3349 NewInst->copyIRFlags(II0);
3350 NewInst->andIRFlags(II1);
3351 }
3352
3353 replaceValue(I, *NewIntrinsic);
3354 return true;
3355}
3356
3357/// Try to convert
3358/// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
3359bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
3360 Value *V0;
3361 ArrayRef<int> Mask;
3362 if (!match(&I, m_Shuffle(m_Value(V0), m_Undef(), m_Mask(Mask))))
3363 return false;
3364
3365 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3366 if (!II0)
3367 return false;
3368
3369 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3370 auto *IntrinsicSrcTy = dyn_cast<FixedVectorType>(II0->getType());
3371 if (!ShuffleDstTy || !IntrinsicSrcTy)
3372 return false;
3373
 3374 // Validate it's a pure permute - the mask should only reference the first vector.
3375 unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
3376 if (any_of(Mask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
3377 return false;
3378
3379 Intrinsic::ID IID = II0->getIntrinsicID();
3380 if (!isTriviallyVectorizable(IID))
3381 return false;
3382
3383 // Cost analysis
3385 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3386 InstructionCost OldCost =
3389 IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
3390
3391 SmallVector<Type *> NewArgsTy;
3392 InstructionCost NewCost = 0;
3393 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3395 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3396 } else {
3397 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3398 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3399 ShuffleDstTy->getNumElements());
3400 NewArgsTy.push_back(ArgTy);
3402 ArgTy, VecTy, Mask, CostKind, 0, nullptr,
3403 {II0->getArgOperand(I)});
3404 }
3405 }
3406 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3407 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3408
3409 // If the intrinsic has multiple uses, we need to account for the cost of
3410 // keeping the original intrinsic around.
3411 if (!II0->hasOneUse())
3412 NewCost += IntrinsicCost;
3413
3414 LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n OldCost: "
3415 << OldCost << " vs NewCost: " << NewCost << "\n");
3416
3417 if (NewCost > OldCost)
3418 return false;
3419
3420 // Transform
3421 SmallVector<Value *> NewArgs;
3422 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3424 NewArgs.push_back(II0->getArgOperand(I));
3425 } else {
3426 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), Mask);
3427 NewArgs.push_back(Shuf);
3428 Worklist.pushValue(Shuf);
3429 }
3430 }
3431
3432 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3433
3434 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic))
3435 NewInst->copyIRFlags(II0);
3436
3437 replaceValue(I, *NewIntrinsic);
3438 return true;
3439}
3440
3441using InstLane = std::pair<Use *, int>;
3442
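/// Starting from use \p U at lane \p Lane, step through any shufflevector
/// values feeding it to find the (use, lane) pair that ultimately supplies
/// that lane, or {nullptr, PoisonMaskElem} if it traces to a poison mask
/// element.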
3443static InstLane lookThroughShuffles(Use *U, int Lane) {
3444 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
3445 unsigned NumElts =
3446 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
3447 int M = SV->getMaskValue(Lane);
3448 if (M < 0)
3449 return {nullptr, PoisonMaskElem};
3450 if (static_cast<unsigned>(M) < NumElts) {
3451 U = &SV->getOperandUse(0);
3452 Lane = M;
3453 } else {
3454 U = &SV->getOperandUse(1);
3455 Lane = M - NumElts;
3456 }
3457 }
3458 return InstLane{U, Lane};
3459}
3460
3464 for (InstLane IL : Item) {
3465 auto [U, Lane] = IL;
3466 InstLane OpLane =
3467 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
3468 Lane)
3469 : InstLane{nullptr, PoisonMaskElem};
3470 NItem.emplace_back(OpLane);
3471 }
3472 return NItem;
3473}
3474
3475/// Detect concat of multiple values into a vector
3477 const TargetTransformInfo &TTI) {
3478 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
3479 unsigned NumElts = Ty->getNumElements();
3480 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
3481 return false;
3482
3483 // Check that the concat is free, usually meaning that the type will be split
3484 // during legalization.
3485 SmallVector<int, 16> ConcatMask(NumElts * 2);
3486 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
3487 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
3488 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
3489 Ty, ConcatMask, CostKind) != 0)
3490 return false;
3491
3492 unsigned NumSlices = Item.size() / NumElts;
3493 // Currently we generate a tree of shuffles for the concats, which limits us
 3494 // to a power of 2.
3495 if (!isPowerOf2_32(NumSlices))
3496 return false;
3497 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
3498 Use *SliceV = Item[Slice * NumElts].first;
3499 if (!SliceV || SliceV->get()->getType() != Ty)
3500 return false;
3501 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
3502 auto [V, Lane] = Item[Slice * NumElts + Elt];
3503 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
3504 return false;
3505 }
3506 }
3507 return true;
3508}
3509
3511 const SmallPtrSet<Use *, 4> &IdentityLeafs,
3512 const SmallPtrSet<Use *, 4> &SplatLeafs,
3513 const SmallPtrSet<Use *, 4> &ConcatLeafs,
3514 IRBuilderBase &Builder,
3515 const TargetTransformInfo *TTI) {
3516 auto [FrontU, FrontLane] = Item.front();
3517
3518 if (IdentityLeafs.contains(FrontU)) {
3519 return FrontU->get();
3520 }
3521 if (SplatLeafs.contains(FrontU)) {
3522 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
3523 return Builder.CreateShuffleVector(FrontU->get(), Mask);
3524 }
3525 if (ConcatLeafs.contains(FrontU)) {
3526 unsigned NumElts =
3527 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
3528 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
3529 for (unsigned S = 0; S < Values.size(); ++S)
3530 Values[S] = Item[S * NumElts].first->get();
3531
3532 while (Values.size() > 1) {
3533 NumElts *= 2;
3534 SmallVector<int, 16> Mask(NumElts, 0);
3535 std::iota(Mask.begin(), Mask.end(), 0);
3536 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
3537 for (unsigned S = 0; S < NewValues.size(); ++S)
3538 NewValues[S] =
3539 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
3540 Values = NewValues;
3541 }
3542 return Values[0];
3543 }
3544
3545 auto *I = cast<Instruction>(FrontU->get());
3546 auto *II = dyn_cast<IntrinsicInst>(I);
3547 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
3549 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
3550 if (II &&
3551 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
3552 Ops[Idx] = II->getOperand(Idx);
3553 continue;
3554 }
3556 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
3557 Builder, TTI);
3558 }
3559
3560 SmallVector<Value *, 8> ValueList;
3561 for (const auto &Lane : Item)
3562 if (Lane.first)
3563 ValueList.push_back(Lane.first->get());
3564
3565 Type *DstTy =
3566 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
3567 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
3568 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
3569 Ops[0], Ops[1]);
3570 propagateIRFlags(Value, ValueList);
3571 return Value;
3572 }
3573 if (auto *CI = dyn_cast<CmpInst>(I)) {
3574 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3575 propagateIRFlags(Value, ValueList);
3576 return Value;
3577 }
3578 if (auto *SI = dyn_cast<SelectInst>(I)) {
3579 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3580 propagateIRFlags(Value, ValueList);
3581 return Value;
3582 }
3583 if (auto *CI = dyn_cast<CastInst>(I)) {
3584 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3585 propagateIRFlags(Value, ValueList);
3586 return Value;
3587 }
3588 if (II) {
3589 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3590 propagateIRFlags(Value, ValueList);
3591 return Value;
3592 }
3593 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3594 auto *Value =
3595 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3596 propagateIRFlags(Value, ValueList);
3597 return Value;
3598}
3599
3600// Starting from a shuffle, look up through operands tracking the shuffled index
3601// of each lane. If we can simplify away the shuffles to identities then
3602// do so.
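//
// As an illustrative sketch (not from a specific test), splitting a vector
// and recombining it lane-for-lane:
//   %lo = shufflevector <4 x i32> %x, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
//   %hi = shufflevector <4 x i32> %x, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
//   %r  = shufflevector <2 x i32> %lo, <2 x i32> %hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// traces every lane of %r back to the same lane of %x, so %r folds to %x.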
3603bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3604 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3605 if (!Ty || I.use_empty())
3606 return false;
3607
3608 SmallVector<InstLane> Start(Ty->getNumElements());
3609 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3610 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
3611
3613 Worklist.push_back(Start);
3614 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
3615 unsigned NumVisited = 0;
3616
3617 while (!Worklist.empty()) {
3618 if (++NumVisited > MaxInstrsToScan)
3619 return false;
3620
3621 SmallVector<InstLane> Item = Worklist.pop_back_val();
3622 auto [FrontU, FrontLane] = Item.front();
3623
3624 // If we found an undef first lane then bail out to keep things simple.
3625 if (!FrontU)
3626 return false;
3627
3628 // Helper to peek through bitcasts to the same value.
3629 auto IsEquiv = [&](Value *X, Value *Y) {
3630 return X->getType() == Y->getType() &&
3632 };
3633
3634 // Look for an identity value.
3635 if (FrontLane == 0 &&
3636 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
3637 Ty->getNumElements() &&
3638 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3639 Value *FrontV = Item.front().first->get();
3640 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
3641 E.value().second == (int)E.index());
3642 })) {
3643 IdentityLeafs.insert(FrontU);
3644 continue;
3645 }
3646 // Look for constants, for the moment only supporting constant splats.
3647 if (auto *C = dyn_cast<Constant>(FrontU);
3648 C && C->getSplatValue() &&
3649 all_of(drop_begin(Item), [Item](InstLane &IL) {
3650 Value *FrontV = Item.front().first->get();
3651 Use *U = IL.first;
3652 return !U || (isa<Constant>(U->get()) &&
3653 cast<Constant>(U->get())->getSplatValue() ==
3654 cast<Constant>(FrontV)->getSplatValue());
3655 })) {
3656 SplatLeafs.insert(FrontU);
3657 continue;
3658 }
3659 // Look for a splat value.
3660 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3661 auto [FrontU, FrontLane] = Item.front();
3662 auto [U, Lane] = IL;
3663 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3664 })) {
3665 SplatLeafs.insert(FrontU);
3666 continue;
3667 }
3668
3669 // We need each element to be the same type of value, and check that each
3670 // element has a single use.
3671 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3672 Value *FrontV = Item.front().first->get();
3673 if (!IL.first)
3674 return true;
3675 Value *V = IL.first->get();
3676 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3677 return false;
3678 if (V->getValueID() != FrontV->getValueID())
3679 return false;
3680 if (auto *CI = dyn_cast<CmpInst>(V))
3681 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3682 return false;
3683 if (auto *CI = dyn_cast<CastInst>(V))
3684 if (CI->getSrcTy()->getScalarType() !=
3685 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3686 return false;
3687 if (auto *SI = dyn_cast<SelectInst>(V))
3688 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3689 SI->getOperand(0)->getType() !=
3690 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3691 return false;
3692 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3693 return false;
3694 auto *II = dyn_cast<IntrinsicInst>(V);
3695 return !II || (isa<IntrinsicInst>(FrontV) &&
3696 II->getIntrinsicID() ==
3697 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3698 !II->hasOperandBundles());
3699 };
3700 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3701 // Check the operator is one that we support.
3702 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3703 // We exclude div/rem in case they hit UB from poison lanes.
3704 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3705 BO && BO->isIntDivRem())
3706 return false;
3709 continue;
3710 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3711 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3713 continue;
3714 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3715 // TODO: Handle vector widening/narrowing bitcasts.
3716 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3717 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3718 if (DstTy && SrcTy &&
3719 SrcTy->getNumElements() == DstTy->getNumElements()) {
3721 continue;
3722 }
3723 } else if (isa<SelectInst>(FrontU)) {
3727 continue;
3728 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3729 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3730 !II->hasOperandBundles()) {
3731 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3732 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3733 &TTI)) {
3734 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3735 Value *FrontV = Item.front().first->get();
3736 Use *U = IL.first;
3737 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3738 cast<Instruction>(FrontV)->getOperand(Op));
3739 }))
3740 return false;
3741 continue;
3742 }
3744 }
3745 continue;
3746 }
3747 }
3748
3749 if (isFreeConcat(Item, CostKind, TTI)) {
3750 ConcatLeafs.insert(FrontU);
3751 continue;
3752 }
3753
3754 return false;
3755 }
3756
3757 if (NumVisited <= 1)
3758 return false;
3759
3760 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3761
3762 // If we got this far, we know the shuffles are superfluous and can be
3763 // removed. Scan through again and generate the new tree of instructions.
3764 Builder.SetInsertPoint(&I);
3765 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3766 ConcatLeafs, Builder, &TTI);
3767 replaceValue(I, *V);
3768 return true;
3769}
3770
3771/// Given a commutative reduction, the order of the input lanes does not alter
3772/// the results. We can use this to remove certain shuffles feeding the
3773/// reduction, removing the need to shuffle at all.
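/// For example (an illustrative sketch, not from a specific test):
///   %s = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
/// The reduction is insensitive to lane order, so sorting the mask yields an
/// identity shuffle and the reduction can operate on %x directly.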
3774bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3775 auto *II = dyn_cast<IntrinsicInst>(&I);
3776 if (!II)
3777 return false;
3778 switch (II->getIntrinsicID()) {
3779 case Intrinsic::vector_reduce_add:
3780 case Intrinsic::vector_reduce_mul:
3781 case Intrinsic::vector_reduce_and:
3782 case Intrinsic::vector_reduce_or:
3783 case Intrinsic::vector_reduce_xor:
3784 case Intrinsic::vector_reduce_smin:
3785 case Intrinsic::vector_reduce_smax:
3786 case Intrinsic::vector_reduce_umin:
3787 case Intrinsic::vector_reduce_umax:
3788 break;
3789 default:
3790 return false;
3791 }
3792
3793 // Find all the inputs when looking through operations that do not alter the
3794 // lane order (binops, for example). Currently we look for a single shuffle,
3795 // and can ignore splat values.
3796 std::queue<Value *> Worklist;
3797 SmallPtrSet<Value *, 4> Visited;
3798 ShuffleVectorInst *Shuffle = nullptr;
3799 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3800 Worklist.push(Op);
3801
3802 while (!Worklist.empty()) {
3803 Value *CV = Worklist.front();
3804 Worklist.pop();
3805 if (Visited.contains(CV))
3806 continue;
3807
3808 // Splats don't change the order, so can be safely ignored.
3809 if (isSplatValue(CV))
3810 continue;
3811
3812 Visited.insert(CV);
3813
3814 if (auto *CI = dyn_cast<Instruction>(CV)) {
3815 if (CI->isBinaryOp()) {
3816 for (auto *Op : CI->operand_values())
3817 Worklist.push(Op);
3818 continue;
3819 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3820 if (Shuffle && Shuffle != SV)
3821 return false;
3822 Shuffle = SV;
3823 continue;
3824 }
3825 }
3826
3827 // Anything else is currently an unknown node.
3828 return false;
3829 }
3830
3831 if (!Shuffle)
3832 return false;
3833
3834 // Check all uses of the binary ops and shuffles are also included in the
3835 // lane-invariant operations (Visited should be the list of lanewise
3836 // instructions, including the shuffle that we found).
3837 for (auto *V : Visited)
3838 for (auto *U : V->users())
3839 if (!Visited.contains(U) && U != &I)
3840 return false;
3841
3842 FixedVectorType *VecType =
3843 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3844 if (!VecType)
3845 return false;
3846 FixedVectorType *ShuffleInputType =
3848 if (!ShuffleInputType)
3849 return false;
3850 unsigned NumInputElts = ShuffleInputType->getNumElements();
3851
3852 // Find the mask from sorting the lanes into order. This is most likely to
 3853 // become an identity or concat mask. Undef elements are pushed to the end.
3854 SmallVector<int> ConcatMask;
3855 Shuffle->getShuffleMask(ConcatMask);
3856 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3857 bool UsesSecondVec =
3858 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3859
3861 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3862 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3864 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3865 ShuffleInputType, ConcatMask, CostKind);
3866
3867 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3868 << "\n");
3869 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3870 << "\n");
3871 bool MadeChanges = false;
3872 if (NewCost < OldCost) {
3873 Builder.SetInsertPoint(Shuffle);
3874 Value *NewShuffle = Builder.CreateShuffleVector(
3875 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3876 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3877 replaceValue(*Shuffle, *NewShuffle);
3878 return true;
3879 }
3880
3881 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3882 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3883 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3884 return MadeChanges;
3885}
3886
3887/// For a given chain of patterns of the following form:
3888///
3889/// ```
3890/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3891///
3892/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3893/// ty1> %1)
3894/// OR
3895/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3896///
3897/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3898/// ...
3899/// ...
3900/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3901/// 3), <n x ty1> %(i - 2)
3902/// OR
3903/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3904///
3905/// %(i) = extractelement <n x ty1> %(i - 1), 0
3906/// ```
3907///
3908/// Where:
3909/// `mask` follows a partition pattern:
3910///
3911/// Ex:
3912/// [n = 8, p = poison]
3913///
3914/// 4 5 6 7 | p p p p
3915/// 2 3 | p p p p p p
3916/// 1 | p p p p p p p
3917///
3918/// For powers of 2, there's a consistent pattern, but for other cases
3919/// the parity of the current half value at each step decides the
3920/// next partition half (see the `ExpectedParityMask` computation below for
3921/// how this is generalised).
3922///
3923/// Ex:
3924/// [n = 6]
3925///
3926/// 3 4 5 | p p p
3927/// 1 2 | p p p p
3928/// 1 | p p p p p
3929bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3930 // Going bottom-up for the pattern.
3931 std::queue<Value *> InstWorklist;
3932 InstructionCost OrigCost = 0;
3933
3934 // Common instruction operation after each shuffle op.
3935 std::optional<unsigned int> CommonCallOp = std::nullopt;
3936 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3937
3938 bool IsFirstCallOrBinInst = true;
3939 bool ShouldBeCallOrBinInst = true;
3940
3941 // This stores the last used instructions for shuffle/common op.
3942 //
3943 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3944 // instructions from either shuffle/common op.
3945 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3946
3947 Value *VecOpEE;
3948 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3949 return false;
3950
3951 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3952 if (!FVT)
3953 return false;
3954
3955 int64_t VecSize = FVT->getNumElements();
3956 if (VecSize < 2)
3957 return false;
3958
3959 // Number of levels would be ~log2(n), considering we always partition
3960 // by half for this fold pattern.
3961 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3962 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3963
3964 // This is how we generalise for all element sizes.
 3965 // At each step, if the vector size is odd, we need non-poison
3966 // values to cover the dominant half so we don't miss out on any element.
3967 //
3968 // This mask will help us retrieve this as we go from bottom to top:
3969 //
3970 // Mask Set -> N = N * 2 - 1
3971 // Mask Unset -> N = N * 2
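  // For example, VecSize = 6 halves as 6 -> 3 -> 2; only the value 3 is odd,
  // so ExpectedParityMask ends up as 0b010 (an illustrative walk-through of
  // the loop below).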
3972 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3973 Cur = (Cur + 1) / 2, --Mask) {
3974 if (Cur & 1)
3975 ExpectedParityMask |= (1ll << Mask);
3976 }
3977
3978 InstWorklist.push(VecOpEE);
3979
3980 while (!InstWorklist.empty()) {
3981 Value *CI = InstWorklist.front();
3982 InstWorklist.pop();
3983
3984 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3985 if (!ShouldBeCallOrBinInst)
3986 return false;
3987
3988 if (!IsFirstCallOrBinInst && any_of(PrevVecV, equal_to(nullptr)))
3989 return false;
3990
3991 // For the first found call/bin op, the vector has to come from the
3992 // extract element op.
3993 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3994 return false;
3995 IsFirstCallOrBinInst = false;
3996
3997 if (!CommonCallOp)
3998 CommonCallOp = II->getIntrinsicID();
3999 if (II->getIntrinsicID() != *CommonCallOp)
4000 return false;
4001
4002 switch (II->getIntrinsicID()) {
4003 case Intrinsic::umin:
4004 case Intrinsic::umax:
4005 case Intrinsic::smin:
4006 case Intrinsic::smax: {
4007 auto *Op0 = II->getOperand(0);
4008 auto *Op1 = II->getOperand(1);
4009 PrevVecV[0] = Op0;
4010 PrevVecV[1] = Op1;
4011 break;
4012 }
4013 default:
4014 return false;
4015 }
4016 ShouldBeCallOrBinInst ^= 1;
4017
4018 IntrinsicCostAttributes ICA(
4019 *CommonCallOp, II->getType(),
4020 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
4021 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
4022
4023 // We may need a swap here since it can be (a, b) or (b, a)
4024 // and accordingly change as we go up.
4025 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
4026 std::swap(PrevVecV[0], PrevVecV[1]);
4027 InstWorklist.push(PrevVecV[1]);
4028 InstWorklist.push(PrevVecV[0]);
4029 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
4030 // Similar logic for bin ops.
4031
4032 if (!ShouldBeCallOrBinInst)
4033 return false;
4034
4035 if (!IsFirstCallOrBinInst && any_of(PrevVecV, equal_to(nullptr)))
4036 return false;
4037
4038 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
4039 return false;
4040 IsFirstCallOrBinInst = false;
4041
4042 if (!CommonBinOp)
4043 CommonBinOp = BinOp->getOpcode();
4044
4045 if (BinOp->getOpcode() != *CommonBinOp)
4046 return false;
4047
4048 switch (*CommonBinOp) {
4049 case BinaryOperator::Add:
4050 case BinaryOperator::Mul:
4051 case BinaryOperator::Or:
4052 case BinaryOperator::And:
4053 case BinaryOperator::Xor: {
4054 auto *Op0 = BinOp->getOperand(0);
4055 auto *Op1 = BinOp->getOperand(1);
4056 PrevVecV[0] = Op0;
4057 PrevVecV[1] = Op1;
4058 break;
4059 }
4060 default:
4061 return false;
4062 }
4063 ShouldBeCallOrBinInst ^= 1;
4064
4065 OrigCost +=
4066 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
4067
4068 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
4069 std::swap(PrevVecV[0], PrevVecV[1]);
4070 InstWorklist.push(PrevVecV[1]);
4071 InstWorklist.push(PrevVecV[0]);
4072 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
 4073 // We shouldn't have any null values in the previous vectors;
 4074 // if so, there was a mismatch in the pattern.
4075 if (ShouldBeCallOrBinInst || any_of(PrevVecV, equal_to(nullptr)))
4076 return false;
4077
4078 if (SVInst != PrevVecV[1])
4079 return false;
4080
4081 ArrayRef<int> CurMask;
4082 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
4083 m_Mask(CurMask))))
4084 return false;
4085
4086 // Subtract the parity mask when checking the condition.
4087 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
4088 if (Mask < ShuffleMaskHalf &&
4089 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
4090 return false;
4091 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
4092 return false;
4093 }
4094
4095 // Update mask values.
4096 ShuffleMaskHalf *= 2;
4097 ShuffleMaskHalf -= (ExpectedParityMask & 1);
4098 ExpectedParityMask >>= 1;
4099
4101 SVInst->getType(), SVInst->getType(),
4102 CurMask, CostKind);
4103
4104 VisitedCnt += 1;
4105 if (!ExpectedParityMask && VisitedCnt == NumLevels)
4106 break;
4107
4108 ShouldBeCallOrBinInst ^= 1;
4109 } else {
4110 return false;
4111 }
4112 }
4113
4114 // Pattern should end with a shuffle op.
4115 if (ShouldBeCallOrBinInst)
4116 return false;
4117
4118 assert(VecSize != -1 && "Expected Match for Vector Size");
4119
4120 Value *FinalVecV = PrevVecV[0];
4121 if (!FinalVecV)
4122 return false;
4123
4124 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
4125
4126 Intrinsic::ID ReducedOp =
4127 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
4128 : getReductionForBinop(*CommonBinOp));
4129 if (!ReducedOp)
4130 return false;
4131
4132 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
4134
4135 if (NewCost >= OrigCost)
4136 return false;
4137
4138 auto *ReducedResult =
4139 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
4140 replaceValue(I, *ReducedResult);
4141
4142 return true;
4143}
4144
 4145/// Determine if it's more efficient to fold:
4146/// reduce(trunc(x)) -> trunc(reduce(x)).
4147/// reduce(sext(x)) -> sext(reduce(x)).
4148/// reduce(zext(x)) -> zext(reduce(x)).
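/// For example (an illustrative sketch, not from a specific test):
///   %e = zext <8 x i8> %x to <8 x i32>
///   %r = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %e)
/// may become, when the narrower reduction is cheaper:
///   %n = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %x)
///   %r = zext i8 %n to i32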
4149bool VectorCombine::foldCastFromReductions(Instruction &I) {
4150 auto *II = dyn_cast<IntrinsicInst>(&I);
4151 if (!II)
4152 return false;
4153
4154 bool TruncOnly = false;
4155 Intrinsic::ID IID = II->getIntrinsicID();
4156 switch (IID) {
4157 case Intrinsic::vector_reduce_add:
4158 case Intrinsic::vector_reduce_mul:
4159 TruncOnly = true;
4160 break;
4161 case Intrinsic::vector_reduce_and:
4162 case Intrinsic::vector_reduce_or:
4163 case Intrinsic::vector_reduce_xor:
4164 break;
4165 default:
4166 return false;
4167 }
4168
4169 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4170 Value *ReductionSrc = I.getOperand(0);
4171
4172 Value *Src;
4173 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
4174 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
4175 return false;
4176
4177 auto CastOpc =
4178 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
4179
4180 auto *SrcTy = cast<VectorType>(Src->getType());
4181 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
4182 Type *ResultTy = I.getType();
4183
4185 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
4186 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
4188 cast<CastInst>(ReductionSrc));
4189 InstructionCost NewCost =
4190 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
4191 CostKind) +
4192 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
4194
4195 if (OldCost <= NewCost || !NewCost.isValid())
4196 return false;
4197
4198 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
4199 II->getIntrinsicID(), {Src});
4200 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
4201 replaceValue(I, *NewCast);
4202 return true;
4203}
4204
4205/// Fold:
4206/// icmp pred (reduce.{add,or,and,umax,umin}(signbit_extract(x))), C
4207/// into:
4208/// icmp sgt/slt (reduce.{or,umax,and,umin}(x)), -1/0
4209///
4210/// Sign-bit reductions produce values with known semantics:
4211/// - reduce.{or,umax}: 0 if no element is negative, 1 if any is
4212/// - reduce.{and,umin}: 1 if all elements are negative, 0 if any isn't
4213/// - reduce.add: count of negative elements (0 to NumElts)
4214///
4215/// Both lshr and ashr are supported:
4216/// - lshr produces 0 or 1, so reduce.add range is [0, N]
4217/// - ashr produces 0 or -1, so reduce.add range is [-N, 0]
4218///
4219/// The fold generalizes to multiple source vectors combined with the same
4220/// operation as the reduction. For example:
 4221/// reduce.or(or(shr A, shr B)) conceptually extends the vector being reduced.
4222/// For reduce.add, this changes the count to M*N where M is the number of
4223/// source vectors.
4224///
4225/// We transform to a direct sign check on the original vector using
4226/// reduce.{or,umax} or reduce.{and,umin}.
4227///
4228/// In spirit, it's similar to foldSignBitCheck in InstCombine.
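///
/// For example (an illustrative sketch, not from a specific test):
///   %s = lshr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
///   %r = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %s)
///   %c = icmp eq i32 %r, 0
/// asks whether no element of %x is negative, so it may become (reduce.umax is
/// used instead of reduce.or if the cost model prefers it):
///   %r = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %x)
///   %c = icmp sgt i32 %r, -1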
4229bool VectorCombine::foldSignBitReductionCmp(Instruction &I) {
4230 CmpPredicate Pred;
4231 IntrinsicInst *ReduceOp;
4232 const APInt *CmpVal;
4233 if (!match(&I,
4234 m_ICmp(Pred, m_OneUse(m_AnyIntrinsic(ReduceOp)), m_APInt(CmpVal))))
4235 return false;
4236
4237 Intrinsic::ID OrigIID = ReduceOp->getIntrinsicID();
4238 switch (OrigIID) {
4239 case Intrinsic::vector_reduce_or:
4240 case Intrinsic::vector_reduce_umax:
4241 case Intrinsic::vector_reduce_and:
4242 case Intrinsic::vector_reduce_umin:
4243 case Intrinsic::vector_reduce_add:
4244 break;
4245 default:
4246 return false;
4247 }
4248
4249 Value *ReductionSrc = ReduceOp->getArgOperand(0);
4250 auto *VecTy = dyn_cast<FixedVectorType>(ReductionSrc->getType());
4251 if (!VecTy)
4252 return false;
4253
4254 unsigned BitWidth = VecTy->getScalarSizeInBits();
4255 if (BitWidth == 1)
4256 return false;
4257
4258 unsigned NumElts = VecTy->getNumElements();
4259
4260 // Determine the expected tree opcode for multi-vector patterns.
4261 // The tree opcode must match the reduction's underlying operation.
4262 //
4263 // TODO: for pairs of equivalent operators, we should match both,
4264 // not only the most common.
4265 Instruction::BinaryOps TreeOpcode;
4266 switch (OrigIID) {
4267 case Intrinsic::vector_reduce_or:
4268 case Intrinsic::vector_reduce_umax:
4269 TreeOpcode = Instruction::Or;
4270 break;
4271 case Intrinsic::vector_reduce_and:
4272 case Intrinsic::vector_reduce_umin:
4273 TreeOpcode = Instruction::And;
4274 break;
4275 case Intrinsic::vector_reduce_add:
4276 TreeOpcode = Instruction::Add;
4277 break;
4278 default:
4279 llvm_unreachable("Unexpected intrinsic");
4280 }
4281
4282 // Collect sign-bit extraction leaves from an associative tree of TreeOpcode.
4283 // The tree conceptually extends the vector being reduced.
4284 SmallVector<Value *, 8> Worklist;
4285 SmallVector<Value *, 8> Sources; // Original vectors (X in shr X, BW-1)
4286 Worklist.push_back(ReductionSrc);
4287 std::optional<bool> IsAShr;
4288 constexpr unsigned MaxSources = 8;
4289
4290 // Calculate old cost: all shifts + tree ops + reduction
4291 InstructionCost OldCost = TTI.getInstructionCost(ReduceOp, CostKind);
4292
4293 while (!Worklist.empty() && Worklist.size() <= MaxSources &&
4294 Sources.size() <= MaxSources) {
4295 Value *V = Worklist.pop_back_val();
4296
4297 // Try to match sign-bit extraction: shr X, (bitwidth-1)
4298 Value *X;
4299 if (match(V, m_OneUse(m_Shr(m_Value(X), m_SpecificInt(BitWidth - 1))))) {
4300 auto *Shr = cast<Instruction>(V);
4301
4302 // All shifts must be the same type (all lshr or all ashr)
4303 bool ThisIsAShr = Shr->getOpcode() == Instruction::AShr;
4304 if (!IsAShr)
4305 IsAShr = ThisIsAShr;
4306 else if (*IsAShr != ThisIsAShr)
4307 return false;
4308
4309 Sources.push_back(X);
4310
4311 // As part of the fold, we remove all of the shifts, so we need to keep
4312 // track of their costs.
4313 OldCost += TTI.getInstructionCost(Shr, CostKind);
4314
4315 continue;
4316 }
4317
4318 // Try to extend through a tree node of the expected opcode
4319 Value *A, *B;
4320 if (!match(V, m_OneUse(m_BinOp(TreeOpcode, m_Value(A), m_Value(B)))))
4321 return false;
4322
4323 // We are potentially replacing these operations as well, so we add them
4324 // to the costs.
4326
4327 Worklist.push_back(A);
4328 Worklist.push_back(B);
4329 }
4330
4331 // Must have at least one source and not exceed limit
4332 if (Sources.empty() || Sources.size() > MaxSources ||
4333 Worklist.size() > MaxSources || !IsAShr)
4334 return false;
4335
4336 unsigned NumSources = Sources.size();
4337
4338 // For reduce.add, the total count must fit as a signed integer.
4339 // Range is [0, M*N] for lshr or [-M*N, 0] for ashr.
4340 if (OrigIID == Intrinsic::vector_reduce_add &&
4341 !isIntN(BitWidth, NumSources * NumElts))
4342 return false;
4343
4344 // Compute the boundary value when all elements are negative:
4345 // - Per-element contribution: 1 for lshr, -1 for ashr
4346 // - For add: M*N (total elements across all sources); for others: just 1
4347 unsigned Count =
4348 (OrigIID == Intrinsic::vector_reduce_add) ? NumSources * NumElts : 1;
4349 APInt NegativeVal(CmpVal->getBitWidth(), Count);
4350 if (*IsAShr)
4351 NegativeVal.negate();
4352
4353 // Range is [min(0, AllNegVal), max(0, AllNegVal)]
4354 APInt Zero = APInt::getZero(CmpVal->getBitWidth());
4355 APInt RangeLow = APIntOps::smin(Zero, NegativeVal);
4356 APInt RangeHigh = APIntOps::smax(Zero, NegativeVal);
4357
4358 // Determine comparison semantics:
4359 // - IsEq: true for equality test, false for inequality
4360 // - TestsNegative: true if testing against AllNegVal, false for zero
4361 //
4362 // In addition to EQ/NE against 0 or AllNegVal, we support inequalities
4363 // that fold to boundary tests given the narrow value range:
4364 // < RangeHigh -> != RangeHigh
4365 // > RangeHigh-1 -> == RangeHigh
4366 // > RangeLow -> != RangeLow
4367 // < RangeLow+1 -> == RangeLow
4368 //
4369 // For inequalities, we work with signed predicates only. Unsigned predicates
4370 // are canonicalized to signed when the range is non-negative (where they are
4371 // equivalent). When the range includes negative values, unsigned predicates
4372 // would have different semantics due to wrap-around, so we reject them.
4373 if (!ICmpInst::isEquality(Pred) && !ICmpInst::isSigned(Pred)) {
4374 if (RangeLow.isNegative())
4375 return false;
4376 Pred = ICmpInst::getSignedPredicate(Pred);
4377 }
4378
4379 bool IsEq;
4380 bool TestsNegative;
4381 if (ICmpInst::isEquality(Pred)) {
4382 if (CmpVal->isZero()) {
4383 TestsNegative = false;
4384 } else if (*CmpVal == NegativeVal) {
4385 TestsNegative = true;
4386 } else {
4387 return false;
4388 }
4389 IsEq = Pred == ICmpInst::ICMP_EQ;
4390 } else if (Pred == ICmpInst::ICMP_SLT && *CmpVal == RangeHigh) {
4391 IsEq = false;
4392 TestsNegative = (RangeHigh == NegativeVal);
4393 } else if (Pred == ICmpInst::ICMP_SGT && *CmpVal == RangeHigh - 1) {
4394 IsEq = true;
4395 TestsNegative = (RangeHigh == NegativeVal);
4396 } else if (Pred == ICmpInst::ICMP_SGT && *CmpVal == RangeLow) {
4397 IsEq = false;
4398 TestsNegative = (RangeLow == NegativeVal);
4399 } else if (Pred == ICmpInst::ICMP_SLT && *CmpVal == RangeLow + 1) {
4400 IsEq = true;
4401 TestsNegative = (RangeLow == NegativeVal);
4402 } else {
4403 return false;
4404 }
4405
4406 // For this fold we support four types of checks:
4407 //
4408 // 1. All lanes are negative - AllNeg
4409 // 2. All lanes are non-negative - AllNonNeg
4410 // 3. At least one negative lane - AnyNeg
4411 // 4. At least one non-negative lane - AnyNonNeg
4412 //
4413 // For each case, we can generate the following code:
4414 //
4415 // 1. AllNeg - reduce.and/umin(X) < 0
4416 // 2. AllNonNeg - reduce.or/umax(X) > -1
4417 // 3. AnyNeg - reduce.or/umax(X) < 0
4418 // 4. AnyNonNeg - reduce.and/umin(X) > -1
4419 //
4420 // The table below shows the aggregation of all supported cases
4421 // using these four cases.
4422 //
4423 // Reduction | == 0 | != 0 | == MAX | != MAX
4424 // ------------+-----------+-----------+-----------+-----------
4425 // or/umax | AllNonNeg | AnyNeg | AnyNeg | AllNonNeg
4426 // and/umin | AnyNonNeg | AllNeg | AllNeg | AnyNonNeg
4427 // add | AllNonNeg | AnyNeg | AllNeg | AnyNonNeg
4428 //
4429 // NOTE: MAX = 1 for or/and/umax/umin, and the vector size N for add
4430 //
4431 // For easier codegen and check inversion, we use the following encoding:
4432 //
4433 // 1. Bit-3 === requires or/umax (1) or and/umin (0) check
4434 // 2. Bit-2 === checks < 0 (1) or > -1 (0)
4435 // 3. Bit-1 === universal (1) or existential (0) check
4436 //
4437 // AnyNeg = 0b110: uses or/umax, checks negative, any-check
4438 // AllNonNeg = 0b101: uses or/umax, checks non-neg, all-check
4439 // AnyNonNeg = 0b000: uses and/umin, checks non-neg, any-check
4440 // AllNeg = 0b011: uses and/umin, checks negative, all-check
4441 //
4442 // XOR with 0b011 inverts the check (swaps all/any and neg/non-neg).
4443 //
4444 enum CheckKind : unsigned {
4445 AnyNonNeg = 0b000,
4446 AllNeg = 0b011,
4447 AllNonNeg = 0b101,
4448 AnyNeg = 0b110,
4449 };
4450 // Return true if we fold this check into or/umax and false for and/umin
4451 auto RequiresOr = [](CheckKind C) -> bool { return C & 0b100; };
4452 // Return true if we should check if result is negative and false otherwise
4453 auto IsNegativeCheck = [](CheckKind C) -> bool { return C & 0b010; };
4454 // Logically invert the check
4455 auto Invert = [](CheckKind C) { return CheckKind(C ^ 0b011); };
4456
4457 CheckKind Base;
4458 switch (OrigIID) {
4459 case Intrinsic::vector_reduce_or:
4460 case Intrinsic::vector_reduce_umax:
4461 Base = TestsNegative ? AnyNeg : AllNonNeg;
4462 break;
4463 case Intrinsic::vector_reduce_and:
4464 case Intrinsic::vector_reduce_umin:
4465 Base = TestsNegative ? AllNeg : AnyNonNeg;
4466 break;
4467 case Intrinsic::vector_reduce_add:
4468 Base = TestsNegative ? AllNeg : AllNonNeg;
4469 break;
4470 default:
4471 llvm_unreachable("Unexpected intrinsic");
4472 }
4473
4474 CheckKind Check = IsEq ? Base : Invert(Base);
4475
4476 auto PickCheaper = [&](Intrinsic::ID Arith, Intrinsic::ID MinMax) {
4477 InstructionCost ArithCost =
4479 VecTy, std::nullopt, CostKind);
4480 InstructionCost MinMaxCost =
4482 FastMathFlags(), CostKind);
4483 return ArithCost <= MinMaxCost ? std::make_pair(Arith, ArithCost)
4484 : std::make_pair(MinMax, MinMaxCost);
4485 };
4486
4487 // Choose output reduction based on encoding's MSB
4488 auto [NewIID, NewCost] = RequiresOr(Check)
4489 ? PickCheaper(Intrinsic::vector_reduce_or,
4490 Intrinsic::vector_reduce_umax)
4491 : PickCheaper(Intrinsic::vector_reduce_and,
4492 Intrinsic::vector_reduce_umin);
4493
4494 // Add cost of combining multiple sources with or/and
4495 if (NumSources > 1) {
4496 unsigned CombineOpc =
4497 RequiresOr(Check) ? Instruction::Or : Instruction::And;
4498 NewCost += TTI.getArithmeticInstrCost(CombineOpc, VecTy, CostKind) *
4499 (NumSources - 1);
4500 }
4501
4502 LLVM_DEBUG(dbgs() << "Found sign-bit reduction cmp: " << I << "\n OldCost: "
4503 << OldCost << " vs NewCost: " << NewCost << "\n");
4504
4505 if (NewCost > OldCost)
4506 return false;
4507
4508 // Generate the combined input and reduction
4509 Builder.SetInsertPoint(&I);
4510 Type *ScalarTy = VecTy->getScalarType();
4511
4512 Value *Input;
4513 if (NumSources == 1) {
4514 Input = Sources[0];
4515 } else {
4516 // Combine sources with or/and based on check type
4517 Input = RequiresOr(Check) ? Builder.CreateOr(Sources)
4518 : Builder.CreateAnd(Sources);
4519 }
4520
4521 Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, NewIID, {Input});
4522 Value *NewCmp = IsNegativeCheck(Check) ? Builder.CreateIsNeg(NewReduce)
4523 : Builder.CreateIsNotNeg(NewReduce);
4524 replaceValue(I, *NewCmp);
4525 return true;
4526}
4527
4528/// vector.reduce.OP f(X_i) == 0 -> vector.reduce.OP X_i == 0
4529///
4530/// We can prove it for cases when:
4531///
4532/// 1. OP X_i == 0 <=> \forall i \in [1, N] X_i == 0
4533/// 1'. OP X_i == 0 <=> \exists j \in [1, N] X_j == 0
4534/// 2. f(x) == 0 <=> x == 0
4535///
4536/// From 1 and 2 (or 1' and 2), we can infer that
4537///
4538/// OP f(X_i) == 0 <=> OP X_i == 0.
4539///
4540/// (1)
4541/// OP f(X_i) == 0 <=> \forall i \in [1, N] f(X_i) == 0
4542/// (2)
4543/// <=> \forall i \in [1, N] X_i == 0
4544/// (1)
4545/// <=> OP(X_i) == 0
4546///
4547/// For some of the OP's and f's, we need to have domain constraints on X
4548/// to ensure properties 1 (or 1') and 2.
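///
/// For example (an illustrative sketch, not from a specific test):
///   %e = zext <4 x i16> %x to <4 x i32>
///   %r = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %e)
///   %c = icmp eq i32 %r, 0
/// may become, because zext(x) == 0 exactly when x == 0:
///   %r = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %x)
///   %c = icmp eq i16 %r, 0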
4549bool VectorCombine::foldICmpEqZeroVectorReduce(Instruction &I) {
4550 CmpPredicate Pred;
4551 Value *Op;
4552 if (!match(&I, m_ICmp(Pred, m_Value(Op), m_Zero())) ||
4553 !ICmpInst::isEquality(Pred))
4554 return false;
4555
4556 auto *II = dyn_cast<IntrinsicInst>(Op);
4557 if (!II)
4558 return false;
4559
4560 switch (II->getIntrinsicID()) {
4561 case Intrinsic::vector_reduce_add:
4562 case Intrinsic::vector_reduce_or:
4563 case Intrinsic::vector_reduce_umin:
4564 case Intrinsic::vector_reduce_umax:
4565 case Intrinsic::vector_reduce_smin:
4566 case Intrinsic::vector_reduce_smax:
4567 break;
4568 default:
4569 return false;
4570 }
4571
4572 Value *InnerOp = II->getArgOperand(0);
4573
4574 // TODO: fixed vector type might be too restrictive
4575 if (!II->hasOneUse() || !isa<FixedVectorType>(InnerOp->getType()))
4576 return false;
4577
4578 Value *X = nullptr;
4579
4580 // Check for zero-preserving operations where f(x) = 0 <=> x = 0
4581 //
4582 // 1. f(x) = shl nuw x, y for arbitrary y
4583 // 2. f(x) = mul nuw x, c for defined c != 0
4584 // 3. f(x) = zext x
4585 // 4. f(x) = sext x
4586 // 5. f(x) = neg x
4587 //
4588 if (!(match(InnerOp, m_NUWShl(m_Value(X), m_Value())) || // Case 1
4589 match(InnerOp, m_NUWMul(m_Value(X), m_NonZeroInt())) || // Case 2
4590 match(InnerOp, m_ZExt(m_Value(X))) || // Case 3
4591 match(InnerOp, m_SExt(m_Value(X))) || // Case 4
4592 match(InnerOp, m_Neg(m_Value(X))) // Case 5
4593 ))
4594 return false;
4595
4596 SimplifyQuery S = SQ.getWithInstruction(&I);
4597 auto *XTy = cast<FixedVectorType>(X->getType());
4598
4599 // Check for domain constraints for all supported reductions.
4600 //
4601 // a. OR X_i - has property 1 for every X
4602 // b. UMAX X_i - has property 1 for every X
4603 // c. UMIN X_i - has property 1' for every X
4604 // d. SMAX X_i - has property 1 for X >= 0
4605 // e. SMIN X_i - has property 1' for X >= 0
4606 // f. ADD X_i - has property 1 for X >= 0 && ADD X_i doesn't sign wrap
4607 //
4608 // In order for the proof to work, we need 1 (or 1') to be true for both
4609 // OP f(X_i) and OP X_i and that's why below we check constraints twice.
4610 //
4611 // NOTE: ADD X_i holds property 1 for a mirror case as well, i.e. when
4612 // X <= 0 && ADD X_i doesn't sign wrap. However, due to the nature
4613 // of known bits, we can't reasonably hold knowledge of "either 0
4614 // or negative".
4615 switch (II->getIntrinsicID()) {
4616 case Intrinsic::vector_reduce_add: {
4617 // We need to check that both X_i and f(X_i) have enough leading
4618 // zeros to not overflow.
4619 KnownBits KnownX = computeKnownBits(X, S);
4620 KnownBits KnownFX = computeKnownBits(InnerOp, S);
4621 unsigned NumElems = XTy->getNumElements();
4622 // Adding N elements loses at most ceil(log2(N)) leading bits.
4623 unsigned LostBits = Log2_32_Ceil(NumElems);
4624 unsigned LeadingZerosX = KnownX.countMinLeadingZeros();
4625 unsigned LeadingZerosFX = KnownFX.countMinLeadingZeros();
4626 // Need at least one leading zero left after summation to ensure no overflow
4627 if (LeadingZerosX <= LostBits || LeadingZerosFX <= LostBits)
4628 return false;
4629
4630 // We are not checking whether X or f(X) are positive explicitly because
4631 // we implicitly checked for it when we checked if both cases have enough
4632 // leading zeros to not wrap addition.
4633 break;
4634 }
4635 case Intrinsic::vector_reduce_smin:
4636 case Intrinsic::vector_reduce_smax:
4637 // Check whether X >= 0 and f(X) >= 0
4638 if (!isKnownNonNegative(InnerOp, S) || !isKnownNonNegative(X, S))
4639 return false;
4640
4641 break;
4642 default:
4643 break;
4644 };
4645
4646 LLVM_DEBUG(dbgs() << "Found a reduction to 0 comparison with removable op: "
4647 << *II << "\n");
4648
4649 // For zext/sext, check if the transform is profitable using cost model.
4650 // For other operations (shl, mul, neg), we're removing an instruction
4651 // while keeping the same reduction type, so it's always profitable.
4652 if (isa<ZExtInst>(InnerOp) || isa<SExtInst>(InnerOp)) {
4653 auto *FXTy = cast<FixedVectorType>(InnerOp->getType());
4654 Intrinsic::ID IID = II->getIntrinsicID();
4655
4657 cast<CastInst>(InnerOp)->getOpcode(), FXTy, XTy,
4659
4660 InstructionCost OldReduceCost, NewReduceCost;
4661 switch (IID) {
4662 case Intrinsic::vector_reduce_add:
4663 case Intrinsic::vector_reduce_or:
4664 OldReduceCost = TTI.getArithmeticReductionCost(
4665 getArithmeticReductionInstruction(IID), FXTy, std::nullopt, CostKind);
4666 NewReduceCost = TTI.getArithmeticReductionCost(
4667 getArithmeticReductionInstruction(IID), XTy, std::nullopt, CostKind);
4668 break;
4669 case Intrinsic::vector_reduce_umin:
4670 case Intrinsic::vector_reduce_umax:
4671 case Intrinsic::vector_reduce_smin:
4672 case Intrinsic::vector_reduce_smax:
4673 OldReduceCost = TTI.getMinMaxReductionCost(
4674 getMinMaxReductionIntrinsicOp(IID), FXTy, FastMathFlags(), CostKind);
4675 NewReduceCost = TTI.getMinMaxReductionCost(
4676 getMinMaxReductionIntrinsicOp(IID), XTy, FastMathFlags(), CostKind);
4677 break;
4678 default:
4679 llvm_unreachable("Unexpected reduction");
4680 }
4681
4682 InstructionCost OldCost = OldReduceCost + ExtCost;
4683 InstructionCost NewCost =
4684 NewReduceCost + (InnerOp->hasOneUse() ? 0 : ExtCost);
4685
4686 LLVM_DEBUG(dbgs() << "Found a removable extension before reduction: "
4687 << *InnerOp << "\n OldCost: " << OldCost
4688 << " vs NewCost: " << NewCost << "\n");
4689
 4690 // We consider the transformation to still be potentially beneficial even
4691 // when the costs are the same because we might remove a use from f(X)
4692 // and unlock other optimizations. Equal costs would just mean that we
4693 // didn't make it worse in the worst case.
4694 if (NewCost > OldCost)
4695 return false;
4696 }
4697
4698 // Since we support zext and sext as f, we might change the scalar type
4699 // of the intrinsic.
4700 Type *Ty = XTy->getScalarType();
4701 Value *NewReduce = Builder.CreateIntrinsic(Ty, II->getIntrinsicID(), {X});
4702 Value *NewCmp =
4703 Builder.CreateICmp(Pred, NewReduce, ConstantInt::getNullValue(Ty));
4704 replaceValue(I, *NewCmp);
4705 return true;
4706}
4707
4708/// Fold comparisons of reduce.or/reduce.and with reduce.umax/reduce.umin
4709/// based on cost, preserving the comparison semantics.
4710///
 4711/// We use the following fundamental properties for each pair:
4712///
4713/// 1. or(X) == 0 <=> umax(X) == 0
4714/// 2. or(X) == 1 <=> umax(X) == 1
4715/// 3. sign(or(X)) == sign(umax(X))
4716///
4717/// 1. and(X) == -1 <=> umin(X) == -1
4718/// 2. and(X) == -2 <=> umin(X) == -2
4719/// 3. sign(and(X)) == sign(umin(X))
4720///
4721/// From these we can infer the following transformations:
4722/// a. or(X) ==/!= 0 <-> umax(X) ==/!= 0
4723/// b. or(X) s< 0 <-> umax(X) s< 0
4724/// c. or(X) s> -1 <-> umax(X) s> -1
4725/// d. or(X) s< 1 <-> umax(X) s< 1
4726/// e. or(X) ==/!= 1 <-> umax(X) ==/!= 1
4727/// f. or(X) s< 2 <-> umax(X) s< 2
4728/// g. and(X) ==/!= -1 <-> umin(X) ==/!= -1
4729/// h. and(X) s< 0 <-> umin(X) s< 0
4730/// i. and(X) s> -1 <-> umin(X) s> -1
4731/// j. and(X) s> -2 <-> umin(X) s> -2
4732/// k. and(X) ==/!= -2 <-> umin(X) ==/!= -2
4733/// l. and(X) s> -3 <-> umin(X) s> -3
4734///
4735bool VectorCombine::foldEquivalentReductionCmp(Instruction &I) {
4736 CmpPredicate Pred;
4737 Value *ReduceOp;
4738 const APInt *CmpVal;
4739 if (!match(&I, m_ICmp(Pred, m_Value(ReduceOp), m_APInt(CmpVal))))
4740 return false;
4741
4742 auto *II = dyn_cast<IntrinsicInst>(ReduceOp);
4743 if (!II || !II->hasOneUse())
4744 return false;
4745
4746 const auto IsValidOrUmaxCmp = [&]() {
4747 // or === umax for i1
4748 if (CmpVal->getBitWidth() == 1)
4749 return true;
4750
4751 // Cases a and e
4752 bool IsEquality =
4753 (CmpVal->isZero() || CmpVal->isOne()) && ICmpInst::isEquality(Pred);
4754 // Case c
4755 bool IsPositive = CmpVal->isAllOnes() && Pred == ICmpInst::ICMP_SGT;
4756 // Cases b, d, and f
4757 bool IsNegative = (CmpVal->isZero() || CmpVal->isOne() || *CmpVal == 2) &&
4758 Pred == ICmpInst::ICMP_SLT;
4759 return IsEquality || IsPositive || IsNegative;
4760 };
4761
4762 const auto IsValidAndUminCmp = [&]() {
4763 // and === umin for i1
4764 if (CmpVal->getBitWidth() == 1)
4765 return true;
4766
4767 const auto LeadingOnes = CmpVal->countl_one();
4768
4769 // Cases g and k
4770 bool IsEquality =
4771 (CmpVal->isAllOnes() || LeadingOnes + 1 == CmpVal->getBitWidth()) &&
4772 ICmpInst::isEquality(Pred);
4773 // Case h
4774 bool IsNegative = CmpVal->isZero() && Pred == ICmpInst::ICMP_SLT;
4775 // Cases i, j, and l
4776 bool IsPositive =
4777 // if the number has at least N - 2 leading ones
4778 // and the two LSBs (bit 1, bit 0) are:
4779 // - 1, 1 -> -1
4780 // - 1, 0 -> -2
4781 // - 0, 1 -> -3
4782 LeadingOnes + 2 >= CmpVal->getBitWidth() &&
4783 ((*CmpVal)[0] || (*CmpVal)[1]) && Pred == ICmpInst::ICMP_SGT;
4784 return IsEquality || IsNegative || IsPositive;
4785 };
4786
4787 Intrinsic::ID OriginalIID = II->getIntrinsicID();
4788 Intrinsic::ID AlternativeIID;
4789
4790 // Check if this is a valid comparison pattern and determine the alternate
4791 // reduction intrinsic.
4792 switch (OriginalIID) {
4793 case Intrinsic::vector_reduce_or:
4794 if (!IsValidOrUmaxCmp())
4795 return false;
4796 AlternativeIID = Intrinsic::vector_reduce_umax;
4797 break;
4798 case Intrinsic::vector_reduce_umax:
4799 if (!IsValidOrUmaxCmp())
4800 return false;
4801 AlternativeIID = Intrinsic::vector_reduce_or;
4802 break;
4803 case Intrinsic::vector_reduce_and:
4804 if (!IsValidAndUminCmp())
4805 return false;
4806 AlternativeIID = Intrinsic::vector_reduce_umin;
4807 break;
4808 case Intrinsic::vector_reduce_umin:
4809 if (!IsValidAndUminCmp())
4810 return false;
4811 AlternativeIID = Intrinsic::vector_reduce_and;
4812 break;
4813 default:
4814 return false;
4815 }
4816
4817 Value *X = II->getArgOperand(0);
4818 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
4819 if (!VecTy)
4820 return false;
4821
4822 const auto GetReductionCost = [&](Intrinsic::ID IID) -> InstructionCost {
4823 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4824 if (ReductionOpc != Instruction::ICmp)
4825 return TTI.getArithmeticReductionCost(ReductionOpc, VecTy, std::nullopt,
4826 CostKind);
4827 return TTI.getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID), VecTy,
4828 FastMathFlags(), CostKind);
4829 };
4830
4831 InstructionCost OrigCost = GetReductionCost(OriginalIID);
4832 InstructionCost AltCost = GetReductionCost(AlternativeIID);
4833
4834 LLVM_DEBUG(dbgs() << "Found equivalent reduction cmp: " << I
4835 << "\n OrigCost: " << OrigCost
4836 << " vs AltCost: " << AltCost << "\n");
4837
4838 if (AltCost >= OrigCost)
4839 return false;
4840
4841 Builder.SetInsertPoint(&I);
4842 Type *ScalarTy = VecTy->getScalarType();
4843 Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, AlternativeIID, {X});
4844 Value *NewCmp =
4845 Builder.CreateICmp(Pred, NewReduce, ConstantInt::get(ScalarTy, *CmpVal));
4846
4847 replaceValue(I, *NewCmp);
4848 return true;
4849}
4850
4851/// Returns true if this ShuffleVectorInst eventually feeds into a
4852/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
4853/// chains of shuffles and binary operators (in any combination/order).
4854 /// The search is bounded by a fixed limit on the number of visited instructions.
4855 static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
4856 constexpr unsigned MaxVisited = 32;
4857 SmallPtrSet<Instruction *, 8> Visited;
4858 SmallVector<Instruction *> WorkList;
4859 bool FoundReduction = false;
4860
4861 WorkList.push_back(SVI);
4862 while (!WorkList.empty()) {
4863 Instruction *I = WorkList.pop_back_val();
4864 for (User *U : I->users()) {
4865 auto *UI = cast<Instruction>(U);
4866 if (!UI || !Visited.insert(UI).second)
4867 continue;
4868 if (Visited.size() > MaxVisited)
4869 return false;
4870 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
4871 // More than one reduction reached
4872 if (FoundReduction)
4873 return false;
4874 switch (II->getIntrinsicID()) {
4875 case Intrinsic::vector_reduce_add:
4876 case Intrinsic::vector_reduce_mul:
4877 case Intrinsic::vector_reduce_and:
4878 case Intrinsic::vector_reduce_or:
4879 case Intrinsic::vector_reduce_xor:
4880 case Intrinsic::vector_reduce_smin:
4881 case Intrinsic::vector_reduce_smax:
4882 case Intrinsic::vector_reduce_umin:
4883 case Intrinsic::vector_reduce_umax:
4884 FoundReduction = true;
4885 continue;
4886 default:
4887 return false;
4888 }
4889 }
4890
4891 if (!isa<BinaryOperator>(UI) && !isa<ShuffleVectorInst>(UI))
4892 return false;
4893
4894 WorkList.emplace_back(UI);
4895 }
4896 }
4897 return FoundReduction;
4898}
4899
4900/// This method looks for groups of shuffles acting on binops, of the form:
4901/// %x = shuffle ...
4902/// %y = shuffle ...
4903/// %a = binop %x, %y
4904/// %b = binop %x, %y
4905/// shuffle %a, %b, selectmask
4906/// We may, especially if the shuffle is wider than legal, be able to convert
4907/// the shuffle to a form where only parts of a and b need to be computed. On
4908/// architectures with no obvious "select" shuffle, this can reduce the total
4909/// number of operations if the target reports them as cheaper.
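/// Illustrative shape of the rewrite (a sketch only; names are placeholders):
///   %xa = shuffle ..., packmask_a   ; only the lanes of %a that are used
///   %ya = shuffle ..., packmask_a
///   %a' = binop %xa, %ya
///   %xb = shuffle ..., packmask_b   ; only the lanes of %b that are used
///   %yb = shuffle ..., packmask_b
///   %b' = binop %xb, %yb
///   shuffle %a', %b', reconstructmask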
4910bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
4911 auto *SVI = cast<ShuffleVectorInst>(&I);
4912 auto *VT = cast<FixedVectorType>(I.getType());
4913 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
4914 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
4915 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
4916 VT != Op0->getType())
4917 return false;
4918
4919 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
4920 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
4921 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
4922 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
4923 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
4924 auto checkSVNonOpUses = [&](Instruction *I) {
4925 if (!I || I->getOperand(0)->getType() != VT)
4926 return true;
4927 return any_of(I->users(), [&](User *U) {
4928 return U != Op0 && U != Op1 &&
4929 !(isa<ShuffleVectorInst>(U) &&
4930 (InputShuffles.contains(cast<Instruction>(U)) ||
4931 isInstructionTriviallyDead(cast<Instruction>(U))));
4932 });
4933 };
4934 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
4935 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
4936 return false;
4937
4938 // Collect all the uses that are shuffles that we can transform together. We
4939 // may not have a single shuffle, but a group that can all be transformed
4940 // together profitably.
4941 SmallVector<ShuffleVectorInst *> Shuffles;
4942 auto collectShuffles = [&](Instruction *I) {
4943 for (auto *U : I->users()) {
4944 auto *SV = dyn_cast<ShuffleVectorInst>(U);
4945 if (!SV || SV->getType() != VT)
4946 return false;
4947 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
4948 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
4949 return false;
4950 if (!llvm::is_contained(Shuffles, SV))
4951 Shuffles.push_back(SV);
4952 }
4953 return true;
4954 };
4955 if (!collectShuffles(Op0) || !collectShuffles(Op1))
4956 return false;
4957 // From a reduction, we need to be processing a single shuffle, otherwise the
4958 // other uses will not be lane-invariant.
4959 if (FromReduction && Shuffles.size() > 1)
4960 return false;
4961
4962 // Add any shuffle uses for the shuffles we have found, to include them in our
4963 // cost calculations.
4964 if (!FromReduction) {
4965 for (ShuffleVectorInst *SV : Shuffles) {
4966 for (auto *U : SV->users()) {
4967 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
4968 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
4969 Shuffles.push_back(SSV);
4970 }
4971 }
4972 }
4973
4974 // For each of the output shuffles, we try to sort all the first vector
4975 // elements to the beginning, followed by the second vector elements at the
4976 // end. If the binops are legalized to smaller vectors, this may reduce the
4977 // total number of binops. We compute the ReconstructMask needed to convert
4978 // back to the original lane order.
4979 SmallVector<std::pair<int, int>> V1, V2;
4980 SmallVector<SmallVector<int>> OrigReconstructMasks;
4981 int MaxV1Elt = 0, MaxV2Elt = 0;
4982 unsigned NumElts = VT->getNumElements();
4983 for (ShuffleVectorInst *SVN : Shuffles) {
4984 SmallVector<int> Mask;
4985 SVN->getShuffleMask(Mask);
4986
4987 // Check the operands are the same as the original, or reversed (in which
4988 // case we need to commute the mask).
4989 Value *SVOp0 = SVN->getOperand(0);
4990 Value *SVOp1 = SVN->getOperand(1);
4991 if (isa<UndefValue>(SVOp1)) {
4992 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
4993 SVOp0 = SSV->getOperand(0);
4994 SVOp1 = SSV->getOperand(1);
4995 for (int &Elem : Mask) {
4996 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
4997 return false;
4998 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
4999 }
5000 }
5001 if (SVOp0 == Op1 && SVOp1 == Op0) {
5002 std::swap(SVOp0, SVOp1);
5003 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
5004 }
5005 if (SVOp0 != Op0 || SVOp1 != Op1)
5006 return false;
5007
5008 // Calculate the reconstruction mask for this shuffle, as the mask needed to
5009 // take the packed values from Op0/Op1 and reconstruct them in the original
5010 // order.
5011 SmallVector<int> ReconstructMask;
5012 for (unsigned I = 0; I < Mask.size(); I++) {
5013 if (Mask[I] < 0) {
5014 ReconstructMask.push_back(-1);
5015 } else if (Mask[I] < static_cast<int>(NumElts)) {
5016 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
5017 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
5018 return Mask[I] == A.first;
5019 });
5020 if (It != V1.end())
5021 ReconstructMask.push_back(It - V1.begin());
5022 else {
5023 ReconstructMask.push_back(V1.size());
5024 V1.emplace_back(Mask[I], V1.size());
5025 }
5026 } else {
5027 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
5028 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
5029 return Mask[I] - static_cast<int>(NumElts) == A.first;
5030 });
5031 if (It != V2.end())
5032 ReconstructMask.push_back(NumElts + It - V2.begin());
5033 else {
5034 ReconstructMask.push_back(NumElts + V2.size());
5035 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
5036 }
5037 }
5038 }
5039
5040 // For reductions, we know that the output lane ordering doesn't alter the
5041 // result. In-order can help simplify the shuffle away.
5042 if (FromReduction)
5043 sort(ReconstructMask);
5044 OrigReconstructMasks.push_back(std::move(ReconstructMask));
5045 }
5046
5047 // If the maximum elements used from V1 and V2 are not larger than the new
5048 // vectors, the vectors are already packed and performing the optimization
5049 // again will likely not help any further. This also prevents us from getting
5050 // stuck in a cycle in case the costs do not also rule it out.
5051 if (V1.empty() || V2.empty() ||
5052 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
5053 MaxV2Elt == static_cast<int>(V2.size()) - 1))
5054 return false;
5055
5056 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
5057 // shuffle of another shuffle, or not a shuffle (which is treated like an
5058 // identity shuffle).
5059 auto GetBaseMaskValue = [&](Instruction *I, int M) {
5060 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5061 if (!SV)
5062 return M;
5063 if (isa<UndefValue>(SV->getOperand(1)))
5064 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
5065 if (InputShuffles.contains(SSV))
5066 return SSV->getMaskValue(SV->getMaskValue(M));
5067 return SV->getMaskValue(M);
5068 };
5069
5070 // Attempt to sort the inputs by ascending mask values to make simpler input
5071 // shuffles and push complex shuffles down to the uses. We sort on the first
5072 // of the two input shuffle orders, to try and get at least one input into a
5073 // nice order.
5074 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
5075 std::pair<int, int> Y) {
5076 int MXA = GetBaseMaskValue(A, X.first);
5077 int MYA = GetBaseMaskValue(A, Y.first);
5078 return MXA < MYA;
5079 };
5080 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
5081 return SortBase(SVI0A, A, B);
5082 });
5083 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
5084 return SortBase(SVI1A, A, B);
5085 });
5086 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
5087 // modified order of the input shuffles.
5088 SmallVector<SmallVector<int>> ReconstructMasks;
5089 for (const auto &Mask : OrigReconstructMasks) {
5090 SmallVector<int> ReconstructMask;
5091 for (int M : Mask) {
5092 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
5093 auto It = find_if(V, [M](auto A) { return A.second == M; });
5094 assert(It != V.end() && "Expected all entries in Mask");
5095 return std::distance(V.begin(), It);
5096 };
5097 if (M < 0)
5098 ReconstructMask.push_back(-1);
5099 else if (M < static_cast<int>(NumElts)) {
5100 ReconstructMask.push_back(FindIndex(V1, M));
5101 } else {
5102 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
5103 }
5104 }
5105 ReconstructMasks.push_back(std::move(ReconstructMask));
5106 }
5107
5108 // Calculate the masks needed for the new input shuffles, which get padded
5109 // with undef
5110 SmallVector<int> V1A, V1B, V2A, V2B;
5111 for (unsigned I = 0; I < V1.size(); I++) {
5112 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
5113 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
5114 }
5115 for (unsigned I = 0; I < V2.size(); I++) {
5116 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
5117 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
5118 }
5119 while (V1A.size() < NumElts) {
5120 V1A.push_back(PoisonMaskElem);
5121 V1B.push_back(PoisonMaskElem);
5122 }
5123 while (V2A.size() < NumElts) {
5124 V2A.push_back(PoisonMaskElem);
5125 V2B.push_back(PoisonMaskElem);
5126 }
5127
5128 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
5129 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5130 if (!SV)
5131 return C;
5132 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
5133 ? TTI::SK_PermuteSingleSrc
5134 : TTI::SK_PermuteTwoSrc,
5135 VT, VT, SV->getShuffleMask(), CostKind);
5136 };
5137 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
5138 return C +
5139 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
5140 };
5141
5142 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
5143 unsigned MaxVectorSize =
5144 TTI.getRegisterBitWidth(TTI::RGK_FixedWidthVector).getFixedValue();
5145 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
5146 if (MaxElementsInVector == 0)
5147 return false;
5148 // When there are multiple shufflevector operations on the same input,
5149 // especially when the vector length is larger than the register size,
5150 // identical shuffle patterns may occur across different groups of elements.
5151 // To avoid overestimating the cost by counting these repeated shuffles more
5152 // than once, we only account for unique shuffle patterns. This adjustment
5153 // prevents inflated costs in the cost model for wide vectors split into
5154 // several register-sized groups.
5155 std::set<SmallVector<int, 4>> UniqueShuffles;
5156 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
5157 // Compute the cost for performing the shuffle over the full vector.
5158 auto ShuffleCost =
5159 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
5160 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
5161 if (NumFullVectors < 2)
5162 return C + ShuffleCost;
5163 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
5164 unsigned NumUniqueGroups = 0;
5165 unsigned NumGroups = Mask.size() / MaxElementsInVector;
5166 // For each group of MaxElementsInVector contiguous elements,
5167 // collect their shuffle pattern and insert into the set of unique patterns.
5168 for (unsigned I = 0; I < NumFullVectors; ++I) {
5169 for (unsigned J = 0; J < MaxElementsInVector; ++J)
5170 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
5171 if (UniqueShuffles.insert(SubShuffle).second)
5172 NumUniqueGroups += 1;
5173 }
5174 return C + ShuffleCost * NumUniqueGroups / NumGroups;
5175 };
5176 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
5177 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5178 if (!SV)
5179 return C;
5180 SmallVector<int, 16> Mask;
5181 SV->getShuffleMask(Mask);
5182 return AddShuffleMaskAdjustedCost(C, Mask);
5183 };
5184 // Check that input consists of ShuffleVectors applied to the same input
5185 auto AllShufflesHaveSameOperands =
5186 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
5187 if (InputShuffles.size() < 2)
5188 return false;
5189 ShuffleVectorInst *FirstSV =
5190 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
5191 if (!FirstSV)
5192 return false;
5193
5194 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
5195 return std::all_of(
5196 std::next(InputShuffles.begin()), InputShuffles.end(),
5197 [&](Instruction *I) {
5198 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
5199 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
5200 });
5201 };
5202
5203 // Get the costs of the shuffles + binops before and after with the new
5204 // shuffle masks.
5205 InstructionCost CostBefore =
5206 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
5207 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
5208 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
5209 InstructionCost(0), AddShuffleCost);
5210 if (AllShufflesHaveSameOperands(InputShuffles)) {
5211 UniqueShuffles.clear();
5212 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
5213 InstructionCost(0), AddShuffleAdjustedCost);
5214 } else {
5215 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
5216 InstructionCost(0), AddShuffleCost);
5217 }
5218
5219 // The new binops will be unused for lanes past the used shuffle lengths.
5220 // These types attempt to get the correct cost for that from the target.
5221 FixedVectorType *Op0SmallVT =
5222 FixedVectorType::get(VT->getScalarType(), V1.size());
5223 FixedVectorType *Op1SmallVT =
5224 FixedVectorType::get(VT->getScalarType(), V2.size());
5225 InstructionCost CostAfter =
5226 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
5227 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
5228 UniqueShuffles.clear();
5229 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
5230 InstructionCost(0), AddShuffleMaskAdjustedCost);
5231 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
5232 CostAfter +=
5233 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
5234 InstructionCost(0), AddShuffleMaskCost);
5235
5236 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
5237 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
5238 << " vs CostAfter: " << CostAfter << "\n");
5239 if (CostBefore < CostAfter ||
5240 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
5241 return false;
5242
5243 // The cost model has passed, create the new instructions.
5244 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
5245 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5246 if (!SV)
5247 return I;
5248 if (isa<UndefValue>(SV->getOperand(1)))
5249 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
5250 if (InputShuffles.contains(SSV))
5251 return SSV->getOperand(Op);
5252 return SV->getOperand(Op);
5253 };
5254 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
5255 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
5256 GetShuffleOperand(SVI0A, 1), V1A);
5257 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
5258 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
5259 GetShuffleOperand(SVI0B, 1), V1B);
5260 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
5261 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
5262 GetShuffleOperand(SVI1A, 1), V2A);
5263 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
5264 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
5265 GetShuffleOperand(SVI1B, 1), V2B);
5266 Builder.SetInsertPoint(Op0);
5267 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
5268 NSV0A, NSV0B);
5269 if (auto *I = dyn_cast<Instruction>(NOp0))
5270 I->copyIRFlags(Op0, true);
5271 Builder.SetInsertPoint(Op1);
5272 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
5273 NSV1A, NSV1B);
5274 if (auto *I = dyn_cast<Instruction>(NOp1))
5275 I->copyIRFlags(Op1, true);
5276
5277 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
5278 Builder.SetInsertPoint(Shuffles[S]);
5279 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
5280 replaceValue(*Shuffles[S], *NSV, false);
5281 }
5282
5283 Worklist.pushValue(NSV0A);
5284 Worklist.pushValue(NSV0B);
5285 Worklist.pushValue(NSV1A);
5286 Worklist.pushValue(NSV1B);
5287 return true;
5288}
5289
5290/// Check if instruction depends on ZExt and this ZExt can be moved after the
5291/// instruction. Move ZExt if it is profitable. For example:
5292/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
5293 /// lshr(zext(x),y) -> zext(lshr(x,trunc(y)))
5294 /// Cost model calculations take into account if zext(x) has other users and
5295/// whether it can be propagated through them too.
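/// For instance (illustrative IR, assuming the cost model agrees):
///   %zx = zext <8 x i16> %x to <8 x i32>
///   %r  = and <8 x i32> %zx, %y
/// may become:
///   %ty = trunc <8 x i32> %y to <8 x i16>
///   %na = and <8 x i16> %x, %ty
///   %r  = zext <8 x i16> %na to <8 x i32>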
5296bool VectorCombine::shrinkType(Instruction &I) {
5297 Value *ZExted, *OtherOperand;
5298 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
5299 m_Value(OtherOperand))) &&
5300 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
5301 return false;
5302
5303 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
5304
5305 auto *BigTy = cast<FixedVectorType>(I.getType());
5306 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
5307 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
5308
5309 if (I.getOpcode() == Instruction::LShr) {
5310 // Check that the shift amount is less than the number of bits in the
5311 // smaller type. Otherwise, the smaller lshr will return a poison value.
5312 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
5313 if (ShAmtKB.getMaxValue().uge(BW))
5314 return false;
5315 } else {
5316 // Check that the expression overall uses at most the same number of bits as
5317 // ZExted
5318 KnownBits KB = computeKnownBits(&I, *DL);
5319 if (KB.countMaxActiveBits() > BW)
5320 return false;
5321 }
5322
5323 // Calculate costs of leaving current IR as it is and moving ZExt operation
5324 // later, along with adding truncates if needed
5325 InstructionCost ZExtCost = TTI.getCastInstrCost(
5326 Instruction::ZExt, BigTy, SmallTy,
5327 TargetTransformInfo::CastContextHint::None, CostKind);
5328 InstructionCost CurrentCost = ZExtCost;
5329 InstructionCost ShrinkCost = 0;
5330
5331 // Calculate total cost and check that we can propagate through all ZExt users
5332 for (User *U : ZExtOperand->users()) {
5333 auto *UI = cast<Instruction>(U);
5334 if (UI == &I) {
5335 CurrentCost +=
5336 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
5337 ShrinkCost +=
5338 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
5339 ShrinkCost += ZExtCost;
5340 continue;
5341 }
5342
5343 if (!Instruction::isBinaryOp(UI->getOpcode()))
5344 return false;
5345
5346 // Check if we can propagate ZExt through its other users
5347 KnownBits KB = computeKnownBits(UI, *DL);
5348 if (KB.countMaxActiveBits() > BW)
5349 return false;
5350
5351 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
5352 ShrinkCost +=
5353 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
5354 ShrinkCost += ZExtCost;
5355 }
5356
5357 // If the other instruction operand is not a constant, we'll need to
5358 // generate a truncate instruction. So we have to adjust cost
5359 if (!isa<Constant>(OtherOperand))
5360 ShrinkCost += TTI.getCastInstrCost(
5361 Instruction::Trunc, SmallTy, BigTy,
5362 TargetTransformInfo::CastContextHint::None, CostKind);
5363
5364 // If the cost of shrinking types and leaving the IR is the same, we'll lean
5365 // towards modifying the IR because shrinking opens opportunities for other
5366 // shrinking optimisations.
5367 if (ShrinkCost > CurrentCost)
5368 return false;
5369
5370 Builder.SetInsertPoint(&I);
5371 Value *Op0 = ZExted;
5372 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
5373 // Keep the order of operands the same
5374 if (I.getOperand(0) == OtherOperand)
5375 std::swap(Op0, Op1);
5376 Value *NewBinOp =
5377 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
5378 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
5379 cast<Instruction>(NewBinOp)->copyMetadata(I);
5380 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
5381 replaceValue(I, *NewZExtr);
5382 return true;
5383}
5384
5385/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
5386/// shuffle (DstVec, SrcVec, Mask)
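/// For example (illustrative, when the cost model considers the shuffle
/// cheaper than the extract/insert pair):
///   %e = extractelement <4 x i32> %src, i64 2
///   %r = insertelement <4 x i32> %dst, i32 %e, i64 0
/// may become:
///   %r = shufflevector <4 x i32> %dst, <4 x i32> %src,
///                      <4 x i32> <i32 6, i32 1, i32 2, i32 3>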
5387bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
5388 Value *DstVec, *SrcVec;
5389 uint64_t ExtIdx, InsIdx;
5390 if (!match(&I,
5391 m_InsertElt(m_Value(DstVec),
5392 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
5393 m_ConstantInt(InsIdx))))
5394 return false;
5395
5396 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
5397 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
5398 // We can try combining vectors with different element counts, but the element types must match.
5399 if (!DstVecTy || !SrcVecTy ||
5400 SrcVecTy->getElementType() != DstVecTy->getElementType())
5401 return false;
5402
5403 unsigned NumDstElts = DstVecTy->getNumElements();
5404 unsigned NumSrcElts = SrcVecTy->getNumElements();
5405 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
5406 return false;
5407
5408 // Insertion into poison is a cheaper single operand shuffle.
5409 TargetTransformInfo::ShuffleKind SK;
5410 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
5411
5412 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
5413 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
5414 if (NeedDstSrcSwap) {
5415 SK = TargetTransformInfo::SK_PermuteSingleSrc;
5416 Mask[InsIdx] = ExtIdx % NumDstElts;
5417 std::swap(DstVec, SrcVec);
5418 } else {
5419 SK = TargetTransformInfo::SK_PermuteTwoSrc;
5420 std::iota(Mask.begin(), Mask.end(), 0);
5421 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
5422 }
5423
5424 // Cost
5425 auto *Ins = cast<InsertElementInst>(&I);
5426 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
5427 InstructionCost InsCost =
5428 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
5429 InstructionCost ExtCost =
5430 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
5431 InstructionCost OldCost = ExtCost + InsCost;
5432
5433 InstructionCost NewCost = 0;
5434 SmallVector<int> ExtToVecMask;
5435 if (!NeedExpOrNarrow) {
5436 // Ignore 'free' identity insertion shuffle.
5437 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
5438 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
5439 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
5440 nullptr, {DstVec, SrcVec});
5441 } else {
5442 // When creating a length-changing vector, always try to keep the relevant
5443 // element in an equivalent position, so that bulk shuffles are more likely
5444 // to be useful.
5445 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
5446 ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
5447 // Add cost for expanding or narrowing
5448 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
5449 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
5450 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
5451 }
5452
5453 if (!Ext->hasOneUse())
5454 NewCost += ExtCost;
5455
5456 LLVM_DEBUG(dbgs() << "Found an insert/extract shuffle-like pair: " << I
5457 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
5458 << "\n");
5459
5460 if (OldCost < NewCost)
5461 return false;
5462
5463 if (NeedExpOrNarrow) {
5464 if (!NeedDstSrcSwap)
5465 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
5466 else
5467 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
5468 }
5469
5470 // Canonicalize undef param to RHS to help further folds.
5471 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
5472 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
5473 std::swap(DstVec, SrcVec);
5474 }
5475
5476 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
5477 replaceValue(I, *Shuf);
5478
5479 return true;
5480}
5481
5482/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
5483/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
5484/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
5485/// before casting it back into `<vscale x 16 x i32>`.
5486bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
5487 const APInt *SplatVal0, *SplatVal1;
5488 if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
5489 m_APInt(SplatVal0), m_APInt(SplatVal1))))
5490 return false;
5491
5492 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
5493 << "\n");
5494
5495 auto *VTy =
5496 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
5497 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
5498 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
5499
5500 // We use <= rather than < here so that we also bail out when the costs of
5501 // the interleave2 intrinsic and the bitcast are both invalid. Even if they
5502 // both have valid and equal costs, it's probably
5503 // not a good idea to emit a high-cost constant splat.
5504 if (TTI.getInstructionCost(&I, CostKind) <=
5505 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
5506 TargetTransformInfo::CastContextHint::None, CostKind)) {
5507 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
5508 << *I.getType() << " is too high.\n");
5509 return false;
5510 }
5511
5512 APInt NewSplatVal = SplatVal1->zext(Width * 2);
5513 NewSplatVal <<= Width;
5514 NewSplatVal |= SplatVal0->zext(Width * 2);
5515 auto *NewSplat = ConstantVector::getSplat(
5516 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
5517
5518 IRBuilder<> Builder(&I);
5519 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
5520 return true;
5521}
5522
5523// Attempt to shrink loads that are only used by shufflevector instructions.
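// Illustrative example (sketch): if a <8 x i32> load feeds only shuffles that
// read lanes 0..3, the load may be shrunk when the cost model agrees:
//   %v = load <8 x i32>, ptr %p
//   %s = shufflevector <8 x i32> %v, <8 x i32> poison,
//                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// may become:
//   %v = load <4 x i32>, ptr %p
//   %s = shufflevector <4 x i32> %v, <4 x i32> poison,
//                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>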
5524bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
5525 auto *OldLoad = dyn_cast<LoadInst>(&I);
5526 if (!OldLoad || !OldLoad->isSimple())
5527 return false;
5528
5529 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
5530 if (!OldLoadTy)
5531 return false;
5532
5533 unsigned const OldNumElements = OldLoadTy->getNumElements();
5534
5535 // Search all uses of load. If all uses are shufflevector instructions, and
5536 // the second operands are all poison values, find the minimum and maximum
5537 // indices of the vector elements referenced by all shuffle masks.
5538 // Otherwise return `std::nullopt`.
5539 using IndexRange = std::pair<int, int>;
5540 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
5541 IndexRange OutputRange = IndexRange(OldNumElements, -1);
5542 for (llvm::Use &Use : I.uses()) {
5543 // Ensure all uses match the required pattern.
5544 User *Shuffle = Use.getUser();
5545 ArrayRef<int> Mask;
5546
5547 if (!match(Shuffle,
5548 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
5549 return std::nullopt;
5550
5551 // Ignore shufflevector instructions that have no uses.
5552 if (Shuffle->use_empty())
5553 continue;
5554
5555 // Find the min and max indices used by the shufflevector instruction.
5556 for (int Index : Mask) {
5557 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
5558 OutputRange.first = std::min(Index, OutputRange.first);
5559 OutputRange.second = std::max(Index, OutputRange.second);
5560 }
5561 }
5562 }
5563
5564 if (OutputRange.second < OutputRange.first)
5565 return std::nullopt;
5566
5567 return OutputRange;
5568 };
5569
5570 // Get the range of vector elements used by shufflevector instructions.
5571 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
5572 unsigned const NewNumElements = Indices->second + 1u;
5573
5574 // If the range of vector elements is smaller than the full load, attempt
5575 // to create a smaller load.
5576 if (NewNumElements < OldNumElements) {
5577 IRBuilder Builder(&I);
5578 Builder.SetCurrentDebugLocation(I.getDebugLoc());
5579
5580 // Calculate costs of old and new ops.
5581 Type *ElemTy = OldLoadTy->getElementType();
5582 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
5583 Value *PtrOp = OldLoad->getPointerOperand();
5584
5585 InstructionCost OldCost = TTI.getMemoryOpCost(
5586 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
5587 OldLoad->getPointerAddressSpace(), CostKind);
5588 InstructionCost NewCost =
5589 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
5590 OldLoad->getPointerAddressSpace(), CostKind);
5591
5592 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
5593 SmallVector<UseEntry> NewUses;
5594 unsigned const MaxIndex = NewNumElements * 2u;
5595
5596 for (llvm::Use &Use : I.uses()) {
5597 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
5598
5599 // Ignore shufflevector instructions that have no uses.
5600 if (Shuffle->use_empty())
5601 continue;
5602
5603 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
5604
5605 // Create entry for new use.
5606 NewUses.push_back({Shuffle, OldMask});
5607
5608 // Validate mask indices.
5609 for (int Index : OldMask) {
5610 if (Index >= static_cast<int>(MaxIndex))
5611 return false;
5612 }
5613
5614 // Update costs.
5615 OldCost +=
5616 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
5617 OldLoadTy, OldMask, CostKind);
5618 NewCost +=
5619 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
5620 NewLoadTy, OldMask, CostKind);
5621 }
5622
5623 LLVM_DEBUG(
5624 dbgs() << "Found a load used only by shufflevector instructions: "
5625 << I << "\n OldCost: " << OldCost
5626 << " vs NewCost: " << NewCost << "\n");
5627
5628 if (OldCost < NewCost || !NewCost.isValid())
5629 return false;
5630
5631 // Create new load of smaller vector.
5632 auto *NewLoad = cast<LoadInst>(
5633 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
5634 NewLoad->copyMetadata(I);
5635
5636 // Replace all uses.
5637 for (UseEntry &Use : NewUses) {
5638 ShuffleVectorInst *Shuffle = Use.first;
5639 std::vector<int> &NewMask = Use.second;
5640
5641 Builder.SetInsertPoint(Shuffle);
5642 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
5643 Value *NewShuffle = Builder.CreateShuffleVector(
5644 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
5645
5646 replaceValue(*Shuffle, *NewShuffle, false);
5647 }
5648
5649 return true;
5650 }
5651 }
5652 return false;
5653}
5654
5655// Attempt to narrow a phi of shufflevector instructions where the two incoming
5656// values have the same operands but different masks. If the two shuffle masks
5657// are offsets of one another we can use one branch to rotate the incoming
5658// vector and perform one larger shuffle after the phi.
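// Illustrative example (sketch, hypothetical masks): with %v of type <4 x i32>,
//   bb0: %s0 = shufflevector <4 x i32> %v, <4 x i32> poison,
//              <8 x i32> <i32 1, i32 2, i32 3, i32 1, i32 2, i32 3, i32 1, i32 2>
//   bb1: %s1 = shufflevector <4 x i32> %v, <4 x i32> poison,
//              <8 x i32> <i32 0, i32 1, i32 2, i32 0, i32 1, i32 2, i32 0, i32 1>
//        %p  = phi <8 x i32> [ %s0, %bb0 ], [ %s1, %bb1 ]
// the two masks differ by a constant offset of 1, so %v can be rotated by one
// lane on the %bb0 edge, the phi narrowed to <4 x i32>, and the %s1 mask
// applied once after the phi.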
5659bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
5660 auto *Phi = dyn_cast<PHINode>(&I);
5661 if (!Phi || Phi->getNumIncomingValues() != 2u)
5662 return false;
5663
5664 Value *Op = nullptr;
5665 ArrayRef<int> Mask0;
5666 ArrayRef<int> Mask1;
5667
5668 if (!match(Phi->getOperand(0u),
5669 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
5670 !match(Phi->getOperand(1u),
5671 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
5672 return false;
5673
5674 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
5675
5676 // Ensure result vectors are wider than the argument vector.
5677 auto *InputVT = cast<FixedVectorType>(Op->getType());
5678 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
5679 auto const InputNumElements = InputVT->getNumElements();
5680
5681 if (InputNumElements >= ResultVT->getNumElements())
5682 return false;
5683
5684 // Take the difference of the two shuffle masks at each index. Ignore poison
5685 // values at the same index in both masks.
5686 SmallVector<int, 16> NewMask;
5687 NewMask.reserve(Mask0.size());
5688
5689 for (auto [M0, M1] : zip(Mask0, Mask1)) {
5690 if (M0 >= 0 && M1 >= 0)
5691 NewMask.push_back(M0 - M1);
5692 else if (M0 == -1 && M1 == -1)
5693 continue;
5694 else
5695 return false;
5696 }
5697
5698 // Ensure all elements of the new mask are equal. If the difference between
5699 // the incoming mask elements is the same, the two must be constant offsets
5700 // of one another.
5701 if (NewMask.empty() || !all_equal(NewMask))
5702 return false;
5703
5704 // Create new mask using difference of the two incoming masks.
5705 int MaskOffset = NewMask[0u];
5706 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
5707 NewMask.clear();
5708
5709 for (unsigned I = 0u; I < InputNumElements; ++I) {
5710 NewMask.push_back(Index);
5711 Index = (Index + 1u) % InputNumElements;
5712 }
5713
5714 // Calculate costs for worst cases and compare.
5715 auto const Kind = TTI::SK_PermuteSingleSrc;
5716 auto OldCost =
5717 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
5718 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
5719 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
5720 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
5721
5722 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
5723 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
5724 << "\n");
5725
5726 if (NewCost > OldCost)
5727 return false;
5728
5729 // Create new shuffles and narrowed phi.
5730 auto Builder = IRBuilder(Shuf);
5731 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
5732 auto *PoisonVal = PoisonValue::get(InputVT);
5733 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
5734 Worklist.push(cast<Instruction>(NewShuf0));
5735
5736 Builder.SetInsertPoint(Phi);
5737 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
5738 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
5739 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
5740 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
5741
5742 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
5743 PoisonVal = PoisonValue::get(NewPhi->getType());
5744 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
5745
5746 replaceValue(*Phi, *NewShuf1);
5747 return true;
5748}
5749
5750/// This is the entry point for all transforms. Pass manager differences are
5751/// handled in the callers of this function.
5752bool VectorCombine::run() {
5753 if (DisableVectorCombine)
5754 return false;
5755
5756 // Don't attempt vectorization if the target does not support vectors.
5757 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
5758 return false;
5759
5760 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
5761
5762 auto FoldInst = [this](Instruction &I) {
5763 Builder.SetInsertPoint(&I);
5764 bool IsVectorType = isa<VectorType>(I.getType());
5765 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
5766 auto Opcode = I.getOpcode();
5767
5768 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
5769
5770 // These folds should be beneficial regardless of when this pass is run
5771 // in the optimization pipeline.
5772 // The type checking is for run-time efficiency. We can avoid wasting time
5773 // dispatching to folding functions if there's no chance of matching.
5774 if (IsFixedVectorType) {
5775 switch (Opcode) {
5776 case Instruction::InsertElement:
5777 if (vectorizeLoadInsert(I))
5778 return true;
5779 break;
5780 case Instruction::ShuffleVector:
5781 if (widenSubvectorLoad(I))
5782 return true;
5783 break;
5784 default:
5785 break;
5786 }
5787 }
5788
5789 // This transform works with scalable and fixed vectors
5790 // TODO: Identify and allow other scalable transforms
5791 if (IsVectorType) {
5792 if (scalarizeOpOrCmp(I))
5793 return true;
5794 if (scalarizeLoad(I))
5795 return true;
5796 if (scalarizeExtExtract(I))
5797 return true;
5798 if (scalarizeVPIntrinsic(I))
5799 return true;
5800 if (foldInterleaveIntrinsics(I))
5801 return true;
5802 }
5803
5804 if (Opcode == Instruction::Store)
5805 if (foldSingleElementStore(I))
5806 return true;
5807
5808 // If this is an early pipeline invocation of this pass, we are done.
5809 if (TryEarlyFoldsOnly)
5810 return false;
5811
5812 // Otherwise, try folds that improve codegen but may interfere with
5813 // early IR canonicalizations.
5814 // The type checking is for run-time efficiency. We can avoid wasting time
5815 // dispatching to folding functions if there's no chance of matching.
5816 if (IsFixedVectorType) {
5817 switch (Opcode) {
5818 case Instruction::InsertElement:
5819 if (foldInsExtFNeg(I))
5820 return true;
5821 if (foldInsExtBinop(I))
5822 return true;
5823 if (foldInsExtVectorToShuffle(I))
5824 return true;
5825 break;
5826 case Instruction::ShuffleVector:
5827 if (foldPermuteOfBinops(I))
5828 return true;
5829 if (foldShuffleOfBinops(I))
5830 return true;
5831 if (foldShuffleOfSelects(I))
5832 return true;
5833 if (foldShuffleOfCastops(I))
5834 return true;
5835 if (foldShuffleOfShuffles(I))
5836 return true;
5837 if (foldPermuteOfIntrinsic(I))
5838 return true;
5839 if (foldShufflesOfLengthChangingShuffles(I))
5840 return true;
5841 if (foldShuffleOfIntrinsics(I))
5842 return true;
5843 if (foldSelectShuffle(I))
5844 return true;
5845 if (foldShuffleToIdentity(I))
5846 return true;
5847 break;
5848 case Instruction::Load:
5849 if (shrinkLoadForShuffles(I))
5850 return true;
5851 break;
5852 case Instruction::BitCast:
5853 if (foldBitcastShuffle(I))
5854 return true;
5855 if (foldSelectsFromBitcast(I))
5856 return true;
5857 break;
5858 case Instruction::And:
5859 case Instruction::Or:
5860 case Instruction::Xor:
5861 if (foldBitOpOfCastops(I))
5862 return true;
5863 if (foldBitOpOfCastConstant(I))
5864 return true;
5865 break;
5866 case Instruction::PHI:
5867 if (shrinkPhiOfShuffles(I))
5868 return true;
5869 break;
5870 default:
5871 if (shrinkType(I))
5872 return true;
5873 break;
5874 }
5875 } else {
5876 switch (Opcode) {
5877 case Instruction::Call:
5878 if (foldShuffleFromReductions(I))
5879 return true;
5880 if (foldCastFromReductions(I))
5881 return true;
5882 break;
5883 case Instruction::ExtractElement:
5884 if (foldShuffleChainsToReduce(I))
5885 return true;
5886 break;
5887 case Instruction::ICmp:
5888 if (foldSignBitReductionCmp(I))
5889 return true;
5890 if (foldICmpEqZeroVectorReduce(I))
5891 return true;
5892 if (foldEquivalentReductionCmp(I))
5893 return true;
5894 [[fallthrough]];
5895 case Instruction::FCmp:
5896 if (foldExtractExtract(I))
5897 return true;
5898 break;
5899 case Instruction::Or:
5900 if (foldConcatOfBoolMasks(I))
5901 return true;
5902 [[fallthrough]];
5903 default:
5904 if (Instruction::isBinaryOp(Opcode)) {
5905 if (foldExtractExtract(I))
5906 return true;
5907 if (foldExtractedCmps(I))
5908 return true;
5909 if (foldBinopOfReductions(I))
5910 return true;
5911 }
5912 break;
5913 }
5914 }
5915 return false;
5916 };
5917
5918 bool MadeChange = false;
5919 for (BasicBlock &BB : F) {
5920 // Ignore unreachable basic blocks.
5921 if (!DT.isReachableFromEntry(&BB))
5922 continue;
5923 // We want to erase instructions as we iterate over the block.
5924 // make_early_inc_range is not applicable here, as the next iterator may
5925 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
5926 // We manually maintain the next instruction and update it when it is about
5927 // to be deleted.
5928 Instruction *I = &BB.front();
5929 while (I) {
5930 NextInst = I->getNextNode();
5931 if (!I->isDebugOrPseudoInst())
5932 MadeChange |= FoldInst(*I);
5933 I = NextInst;
5934 }
5935 }
5936
5937 NextInst = nullptr;
5938
5939 while (!Worklist.isEmpty()) {
5940 Instruction *I = Worklist.removeOne();
5941 if (!I)
5942 continue;
5943
5944 if (isInstructionTriviallyDead(I)) {
5945 eraseInstruction(*I);
5946 continue;
5947 }
5948
5949 MadeChange |= FoldInst(*I);
5950 }
5951
5952 return MadeChange;
5953}
5954
5955 PreservedAnalyses VectorCombinePass::run(Function &F,
5956 FunctionAnalysisManager &FAM) {
5957 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
5958 TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
5959 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
5960 AAResults &AA = FAM.getResult<AAManager>(F);
5961 const DataLayout *DL = &F.getDataLayout();
5962 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
5963 TryEarlyFoldsOnly);
5964 if (!Combiner.run())
5965 return PreservedAnalyses::all();
5966 PreservedAnalyses PA;
5967 PA.preserveSet<CFGAnalyses>();
5968 return PA;
5969}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< unsigned > MaxInstrsToScan("aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden, cl::desc("Max number of instructions to scan for aggressive instcombine."))
This is the interface for LLVM's primary stateless and local alias analysis.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
This file defines the DenseMap class.
#define Check(C,...)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition LICM.cpp:1449
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T1
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
FunctionAnalysisManager FAM
const SmallVectorImpl< MachineOperand > & Cond
unsigned OpIndex
This file contains some templates that are useful if you are working with the STL at all.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static Value * generateNewInstTree(ArrayRef< InstLane > Item, FixedVectorType *Ty, const SmallPtrSet< Use *, 4 > &IdentityLeafs, const SmallPtrSet< Use *, 4 > &SplatLeafs, const SmallPtrSet< Use *, 4 > &ConcatLeafs, IRBuilderBase &Builder, const TargetTransformInfo *TTI)
static bool isFreeConcat(ArrayRef< InstLane > Item, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI)
Detect concat of multiple values into a vector.
static void analyzeCostOfVecReduction(const IntrinsicInst &II, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI, InstructionCost &CostBeforeReduction, InstructionCost &CostAfterReduction)
static SmallVector< InstLane > generateInstLaneVectorFromOperand(ArrayRef< InstLane > Item, int Op)
static Value * createShiftShuffle(Value *Vec, unsigned OldIndex, unsigned NewIndex, IRBuilderBase &Builder)
Create a shuffle that translates (shifts) 1 element from the input vector to a new element location.
static Align computeAlignmentAfterScalarization(Align VectorAlignment, Type *ScalarType, Value *Idx, const DataLayout &DL)
The memory operation on a vector of ScalarType had alignment of VectorAlignment.
static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI)
Returns true if this ShuffleVectorInst eventually feeds into a vector reduction intrinsic (e....
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, Instruction *CtxI, AssumptionCache &AC, const DominatorTree &DT)
Check if it is legal to scalarize a memory access to VecTy at index Idx.
static cl::opt< bool > DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden, cl::desc("Disable all vector combine transforms"))
static InstLane lookThroughShuffles(Use *U, int Lane)
static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI)
static const unsigned InvalidIndex
std::pair< Use *, int > InstLane
static Value * translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, IRBuilderBase &Builder)
Given an extract element instruction with constant index operand, shuffle the source vector (shift th...
static cl::opt< unsigned > MaxInstrsToScan("vector-combine-max-scan-instrs", cl::init(30), cl::Hidden, cl::desc("Max number of instructions to scan for vector combining."))
static cl::opt< bool > DisableBinopExtractShuffle("disable-binop-extract-shuffle", cl::init(false), cl::Hidden, cl::desc("Disable binop extract to shuffle transforms"))
static bool isMemModifiedBetween(BasicBlock::iterator Begin, BasicBlock::iterator End, const MemoryLocation &Loc, AAResults &AA)
static constexpr int Concat[]
Value * RHS
Value * LHS
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1023
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1630
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Return true if the attribute exists in this set.
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
BinaryOps getOpcode() const
Definition InstrTypes.h:374
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
bool isFPPredicate() const
Definition InstrTypes.h:782
static LLVM_ABI std::optional< CmpPredicate > getMatching(CmpPredicate A, CmpPredicate B)
Compares two CmpPredicates taking samesign into account and returns the canonicalized CmpPredicate if...
Combiner implementation.
Definition Combiner.h:34
static LLVM_ABI Constant * getExtractElement(Constant *Vec, Constant *Idx, Type *OnlyIfReducedTy=nullptr)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
LLVM_ABI ConstantRange urem(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an unsigned remainder operation of...
LLVM_ABI ConstantRange binaryAnd(const ConstantRange &Other) const
Return a new range representing the possible values resulting from a binary-and of a value in this ra...
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
This instruction extracts a single (scalar) element from a VectorType value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
Predicate getSignedPredicate() const
For example, EQ->EQ, SLE->SLE, UGT->SGT, etc.
bool isEquality() const
Return true if this predicate is either EQ or NE.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2561
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2549
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1871
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2627
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1516
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition IRBuilder.h:2210
Value * CreateIsNotNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg > -1.
Definition IRBuilder.h:2651
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:1952
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2235
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2442
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2473
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition IRBuilder.h:172
Value * CreateIsNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg < 0.
Definition IRBuilder.h:2646
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2176
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1854
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1495
LLVM_ABI Value * CreateNAryOp(unsigned Opc, ArrayRef< Value * > Ops, const Twine &Name="", MDNode *FPMathTag=nullptr)
Create either a UnaryOperator or BinaryOperator depending on Opc.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2054
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2583
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1554
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1867
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2040
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:604
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1711
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateFNegFMF(Value *V, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1802
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2418
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1576
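The IRBuilder entries above cover the creation helpers this file leans on most. As a rough illustration only, not code from the pass itself (Builder and Vec are assumed placeholders), they compose like this:
#include "llvm/IR/IRBuilder.h"
using namespace llvm;
// Hypothetical helper: add the first two lanes of a <4 x i32> value and
// blend the sum into lanes 0 and 1, keeping lanes 2 and 3 of the input.
static Value *sumLowLanes(IRBuilder<> &Builder, Value *Vec) {
  Value *Lane0 = Builder.CreateExtractElement(Vec, uint64_t(0), "lane0");
  Value *Lane1 = Builder.CreateExtractElement(Vec, uint64_t(1), "lane1");
  Value *Sum = Builder.CreateBinOp(Instruction::Add, Lane0, Lane1, "sum");
  Value *Splat = Builder.CreateVectorSplat(/*NumElts=*/4, Sum, "splat");
  // Mask lanes 0-1 pick from the second operand (indices 4, 5); lanes 2-3
  // keep the corresponding lanes of Vec.
  return Builder.CreateShuffleVector(Vec, Splat, {4, 5, 2, 3});
}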
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void push(Instruction *I)
Push the instruction onto the worklist stack.
LLVM_ABI void setHasNoUnsignedWrap(bool b=true)
Set or clear the nuw flag on this instruction, which must be an operator which supports this flag.
LLVM_ABI void copyIRFlags(const Value *V, bool IncludeWrapFlags=true)
Convenience method to copy supported exact, fast-math, and (optionally) wrapping flags from V to this...
LLVM_ABI void setHasNoSignedWrap(bool b=true)
Set or clear the nsw flag on this instruction, which must be an operator which supports this flag.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void andIRFlags(const Value *V)
Logical 'and' of any supported wrapping, exact, and fast-math flags of V and this instruction.
bool isBinaryOp() const
LLVM_ABI void setNonNeg(bool b=true)
Set or clear the nneg flag on this instruction, which must be a zext instruction.
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
LLVM_ABI AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAlignment(Align Align)
Type * getPointerOperandType() const
Align getAlign() const
Return the alignment of the access that is being performed.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
const SDValue & getOperand(unsigned Num) const
This instruction constructs a fixed permutation of two input vectors.
int getMaskValue(unsigned Elt) const
Return the shuffle mask value of this instruction for the given element index.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static void commuteShuffleMask(MutableArrayRef< int > Mask, unsigned InVecNumElts)
Change values in a shuffle permute mask assuming the two vector operands of length InVecNumElts have ...
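A brief, hypothetical sketch of how the mask accessors above are typically combined (Shuf is an assumed placeholder for a shuffle reached during the visit loop):
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
// Returns true when Shuf merely reproduces its first operand, i.e. its mask
// is an identity permutation over that operand's element count.
static bool isNoOpShuffle(const ShuffleVectorInst &Shuf) {
  SmallVector<int> Mask;
  Shuf.getShuffleMask(Mask); // PoisonMaskElem (-1) marks "don't care" lanes.
  auto *SrcTy = cast<FixedVectorType>(Shuf.getOperand(0)->getType());
  return ShuffleVectorInst::isIdentityMask(Mask, SrcTy->getNumElements());
}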
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
void setAlignment(Align Align)
Analysis pass providing the TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ None
The insert/extract is not used with a load/store.
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
static LLVM_ABI OperandValueInfo commonOperandInfo(const Value *X, const Value *Y)
Collect common data between two OperandValueInfo inputs.
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const
Returns true if GEP should not be used to index into vectors for this target.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, const Value *Op0=nullptr, const Value *Op1=nullptr, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
LLVM_ABI unsigned getMinVectorRegisterBitWidth() const
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
Estimate the overhead of scalarizing an instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
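The cost interfaces above drive every fold in this file: a transform is applied only when the estimated cost of the new form does not exceed the old one. A minimal sketch of that pattern follows; the opcodes and types are assumptions for illustration, not the pass's actual queries:
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
// Compare "extract a lane, then add scalars" against "add whole vectors".
static bool vectorAddIsCheaper(const TargetTransformInfo &TTI,
                               FixedVectorType *VecTy) {
  TargetTransformInfo::TargetCostKind CostKind =
      TargetTransformInfo::TCK_RecipThroughput;
  InstructionCost ScalarCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy->getElementType(),
                                 CostKind) +
      TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
                             /*Index=*/0);
  InstructionCost VectorCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind);
  return VectorCost <= ScalarCost;
}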
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:267
Value * getOperand(unsigned i) const
Definition User.h:207
static LLVM_ABI bool isVPBinOp(Intrinsic::ID ID)
std::optional< unsigned > getFunctionalIntrinsicID() const
std::optional< unsigned > getFunctionalOpcode() const
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition Value.h:761
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:166
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:427
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:967
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:544
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
bool use_empty() const
Definition Value.h:347
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
bool user_empty() const
Definition Value.h:390
PreservedAnalyses run(Function &F, FunctionAnalysisManager &)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type size() const
Definition DenseSet.h:87
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition APInt.h:2263
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition APInt.h:2268
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
class_match< PoisonValue > m_Poison()
Match an arbitrary poison constant.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::URem > m_URem(const LHS &L, const RHS &R)
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
class_match< IntrinsicInst > m_AnyIntrinsic()
Matches any intrinsic call and ignore it.
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
DisjointOr_match< LHS, RHS > m_DisjointOr(const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_right_shift_op > m_Shr(const LHS &L, const RHS &R)
Matches logical shift operations.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
cst_pred_ty< is_non_zero_int > m_NonZeroInt()
Match a non-zero integer or a vector with all non-zero elements.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Shl, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWShl(const LHS &L, const RHS &R)
OverflowingBinaryOp_match< LHS, RHS, Instruction::Mul, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWMul(const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_bitwiselogic_op, true > m_c_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations in either order.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
match_combine_or< CastInst_match< OpTy, SExtInst >, NNegZExt_match< OpTy > > m_SExtLike(const OpTy &Op)
Match either "sext" or "zext nneg".
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
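The PatternMatch helpers listed above are how folds recognize candidate IR. A hypothetical example of their use (the shape matched here is illustrative, not one of the pass's actual patterns):
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;
// Recognize "insert (binop (extract X, C0), (extract Y, C1)) into some vector
// at a constant index", capturing the two source vectors.
static bool matchScalarizedBinOp(Instruction &I, Value *&X, Value *&Y) {
  return match(&I,
               m_InsertElt(m_Value(),
                           m_BinOp(m_ExtractElt(m_Value(X), m_ConstantInt()),
                                   m_ExtractElt(m_Value(Y), m_ConstantInt())),
                           m_ConstantInt()));
}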
@ Valid
The data is already valid.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
@ User
could "use" a pointer
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2116
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:538
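For context, a hedged sketch of the replace-then-clean-up idiom this utility supports (Old and NewV are assumed placeholders):
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
static void replaceAndCleanUp(Instruction &Old, Value *NewV) {
  Old.replaceAllUsesWith(NewV);
  // Deletes Old, and recursively any of its operands that become unused,
  // but only if it is now trivially dead (no uses, no side effects).
  RecursivelyDeleteTriviallyDeadInstructions(&Old);
}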
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:350
LLVM_ABI Value * simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q)
Given operand for a UnaryOperator, fold the result or return null.
scope_exit(Callable) -> scope_exit< Callable >
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
LLVM_ABI Value * simplifyCall(CallBase *Call, Value *Callee, ArrayRef< Value * > Args, const SimplifyQuery &Q)
Given a callsite, callee, and arguments, fold the result or return null.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
LLVM_ABI bool mustSuppressSpeculation(const LoadInst &LI)
Return true if speculation of the given load must be suppressed to avoid ordering or interfering with...
Definition Loads.cpp:431
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
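A small, self-contained example of what the mask scaling succeeds and fails on (the concrete masks are assumptions for illustration):
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;
static void widenMaskExample() {
  // A <4 x i32> mask that swaps 64-bit halves can be rewritten for <2 x i64>.
  SmallVector<int> NarrowMask = {2, 3, 0, 1};
  SmallVector<int> WideMask;
  bool Ok = widenShuffleMaskElts(/*Scale=*/2, NarrowMask, WideMask);
  // Ok == true and WideMask == {1, 0}.
  (void)Ok;
  // A mask that splits a 64-bit chunk across lanes cannot be widened.
  bool Bad = widenShuffleMaskElts(/*Scale=*/2, {1, 2, 3, 0}, WideMask);
  // Bad == false.
  (void)Bad;
}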
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, bool UseInstrInfo=true, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
unsigned M1(unsigned Val)
Definition VE.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:406
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool isModSet(const ModRefInfo MRI)
Definition ModRef.h:49
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
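A hedged sketch of how the known-bits query above can bound a value, e.g. to show a dynamic index stays below an element count (all names are placeholders):
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/KnownBits.h"
using namespace llvm;
// True when known-bits analysis proves V < NumElts, so V is a safe
// in-range vector index at the context instruction CxtI.
static bool indexKnownInBounds(const Value *V, unsigned NumElts,
                               const DataLayout &DL, const Instruction *CxtI) {
  KnownBits Known(V->getType()->getScalarSizeInBits());
  computeKnownBits(V, Known, DL, /*AC=*/nullptr, CxtI);
  return Known.getMaxValue().ult(NumElts);
}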
LLVM_ABI bool programUndefinedIfPoison(const Instruction *Inst)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:446
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ABI bool isKnownNonZero(const Value *V, const SimplifyQuery &Q, unsigned Depth=0)
Return true if the given value is known to be non-zero when defined.
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
LLVM_ABI bool isSafeToSpeculativelyExecuteWithOpcode(unsigned Opcode, const Instruction *Inst, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
This returns the same result as isSafeToSpeculativelyExecute if Opcode is the actual opcode of Inst.
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
LLVM_ABI Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc)
Returns the reduction intrinsic id corresponding to the binary operation.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
LLVM_ABI Constant * getLosslessInvCast(Constant *C, Type *InvCastTo, unsigned CastOp, const DataLayout &DL, PreservedCastFlags *Flags=nullptr)
Try to cast C to InvC losslessly, satisfying CastOp(InvC) equals C, or CastOp(InvC) is a refined valu...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2166
LLVM_ABI Value * simplifyCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a CmpInst, fold the result or return null.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicID(Intrinsic::ID IID)
Returns the llvm.vector.reduce min/max intrinsic that corresponds to the intrinsic op.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
LLVM_ABI AAMDNodes adjustForAccess(unsigned AccessSize)
Create a new AAMDNode for accessing AccessSize bytes of this AAMDNode.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:312
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:264
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:148
SimplifyQuery getWithInstruction(const Instruction *I) const