LLVM 23.0.0git
VectorCombine.cpp
Go to the documentation of this file.
1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
39#include <numeric>
40#include <optional>
41#include <queue>
42#include <set>
43
44#define DEBUG_TYPE "vector-combine"
46
47using namespace llvm;
48using namespace llvm::PatternMatch;
49
50STATISTIC(NumVecLoad, "Number of vector loads formed");
51STATISTIC(NumVecCmp, "Number of vector compares formed");
52STATISTIC(NumVecBO, "Number of vector binops formed");
53STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
54STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
55STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
56STATISTIC(NumScalarCmp, "Number of scalar compares formed");
57STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
58
60 "disable-vector-combine", cl::init(false), cl::Hidden,
61 cl::desc("Disable all vector combine transforms"));
62
64 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
65 cl::desc("Disable binop extract to shuffle transforms"));
66
68 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
69 cl::desc("Max number of instructions to scan for vector combining."));
70
71static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
72
73namespace {
74class VectorCombine {
75public:
76 VectorCombine(Function &F, const TargetTransformInfo &TTI,
79 bool TryEarlyFoldsOnly)
80 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
81 DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
82 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
83
84 bool run();
85
86private:
87 Function &F;
89 const TargetTransformInfo &TTI;
90 const DominatorTree &DT;
91 AAResults &AA;
92 AssumptionCache &AC;
93 const DataLayout *DL;
94 TTI::TargetCostKind CostKind;
95 const SimplifyQuery SQ;
96
97 /// If true, only perform beneficial early IR transforms. Do not introduce new
98 /// vector operations.
99 bool TryEarlyFoldsOnly;
100
101 InstructionWorklist Worklist;
102
103 /// Next instruction to iterate. It will be updated when it is erased by
104 /// RecursivelyDeleteTriviallyDeadInstructions.
105 Instruction *NextInst;
106
107 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
108 // parameter. That should be updated to specific sub-classes because the
109 // run loop was changed to dispatch on opcode.
110 bool vectorizeLoadInsert(Instruction &I);
111 bool widenSubvectorLoad(Instruction &I);
112 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
113 ExtractElementInst *Ext1,
114 unsigned PreferredExtractIndex) const;
115 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
116 const Instruction &I,
117 ExtractElementInst *&ConvertToShuffle,
118 unsigned PreferredExtractIndex);
119 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
121 bool foldExtractExtract(Instruction &I);
122 bool foldInsExtFNeg(Instruction &I);
123 bool foldInsExtBinop(Instruction &I);
124 bool foldInsExtVectorToShuffle(Instruction &I);
125 bool foldBitOpOfCastops(Instruction &I);
126 bool foldBitOpOfCastConstant(Instruction &I);
127 bool foldBitcastShuffle(Instruction &I);
128 bool scalarizeOpOrCmp(Instruction &I);
129 bool scalarizeVPIntrinsic(Instruction &I);
130 bool foldExtractedCmps(Instruction &I);
131 bool foldSelectsFromBitcast(Instruction &I);
132 bool foldBinopOfReductions(Instruction &I);
133 bool foldSingleElementStore(Instruction &I);
134 bool scalarizeLoad(Instruction &I);
135 bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
136 bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
137 bool scalarizeExtExtract(Instruction &I);
138 bool foldConcatOfBoolMasks(Instruction &I);
139 bool foldPermuteOfBinops(Instruction &I);
140 bool foldShuffleOfBinops(Instruction &I);
141 bool foldShuffleOfSelects(Instruction &I);
142 bool foldShuffleOfCastops(Instruction &I);
143 bool foldShuffleOfShuffles(Instruction &I);
144 bool foldPermuteOfIntrinsic(Instruction &I);
145 bool foldShufflesOfLengthChangingShuffles(Instruction &I);
146 bool foldShuffleOfIntrinsics(Instruction &I);
147 bool foldShuffleToIdentity(Instruction &I);
148 bool foldShuffleFromReductions(Instruction &I);
149 bool foldShuffleChainsToReduce(Instruction &I);
150 bool foldCastFromReductions(Instruction &I);
151 bool foldSignBitReductionCmp(Instruction &I);
152 bool foldICmpEqZeroVectorReduce(Instruction &I);
153 bool foldEquivalentReductionCmp(Instruction &I);
154 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
155 bool foldInterleaveIntrinsics(Instruction &I);
156 bool shrinkType(Instruction &I);
157 bool shrinkLoadForShuffles(Instruction &I);
158 bool shrinkPhiOfShuffles(Instruction &I);
159
160 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
161 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
162 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
163 Old.replaceAllUsesWith(&New);
164 if (auto *NewI = dyn_cast<Instruction>(&New)) {
165 New.takeName(&Old);
166 Worklist.pushUsersToWorkList(*NewI);
167 Worklist.pushValue(NewI);
168 }
169 if (Erase && isInstructionTriviallyDead(&Old)) {
170 eraseInstruction(Old);
171 } else {
172 Worklist.push(&Old);
173 }
174 }
175
176 void eraseInstruction(Instruction &I) {
177 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
178 SmallVector<Value *> Ops(I.operands());
179 Worklist.remove(&I);
180 I.eraseFromParent();
181
182 // Push remaining users of the operands and then the operand itself - allows
183 // further folds that were hindered by OneUse limits.
184 SmallPtrSet<Value *, 4> Visited;
185 for (Value *Op : Ops) {
186 if (!Visited.contains(Op)) {
187 if (auto *OpI = dyn_cast<Instruction>(Op)) {
189 OpI, nullptr, nullptr, [&](Value *V) {
190 if (auto *I = dyn_cast<Instruction>(V)) {
191 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
192 Worklist.remove(I);
193 if (I == NextInst)
194 NextInst = NextInst->getNextNode();
195 Visited.insert(I);
196 }
197 }))
198 continue;
199 Worklist.pushUsersToWorkList(*OpI);
200 Worklist.pushValue(OpI);
201 }
202 }
203 }
204 }
205};
206} // namespace
207
208/// Return the source operand of a potentially bitcasted value. If there is no
209/// bitcast, return the input value itself.
211 while (auto *BitCast = dyn_cast<BitCastInst>(V))
212 V = BitCast->getOperand(0);
213 return V;
214}
215
216static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
217 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
218 // The widened load may load data from dirty regions or create data races
219 // non-existent in the source.
220 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
221 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
223 return false;
224
225 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
226 // sure we have all of our type-based constraints in place for this target.
227 Type *ScalarTy = Load->getType()->getScalarType();
228 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
229 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
230 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
231 ScalarSize % 8 != 0)
232 return false;
233
234 return true;
235}
236
237bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
238 // Match insert into fixed vector of scalar value.
239 // TODO: Handle non-zero insert index.
240 Value *Scalar;
241 if (!match(&I,
243 return false;
244
245 // Optionally match an extract from another vector.
246 Value *X;
247 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
248 if (!HasExtract)
249 X = Scalar;
250
251 auto *Load = dyn_cast<LoadInst>(X);
252 if (!canWidenLoad(Load, TTI))
253 return false;
254
255 Type *ScalarTy = Scalar->getType();
256 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
257 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
258
259 // Check safety of replacing the scalar load with a larger vector load.
260 // We use minimal alignment (maximum flexibility) because we only care about
261 // the dereferenceable region. When calculating cost and creating a new op,
262 // we may use a larger value based on alignment attributes.
263 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
264 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
265
266 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
267 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
268 unsigned OffsetEltIndex = 0;
269 Align Alignment = Load->getAlign();
270 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
271 &DT)) {
272 // It is not safe to load directly from the pointer, but we can still peek
273 // through gep offsets and check if it safe to load from a base address with
274 // updated alignment. If it is, we can shuffle the element(s) into place
275 // after loading.
276 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
277 APInt Offset(OffsetBitWidth, 0);
279
280 // We want to shuffle the result down from a high element of a vector, so
281 // the offset must be positive.
282 if (Offset.isNegative())
283 return false;
284
285 // The offset must be a multiple of the scalar element to shuffle cleanly
286 // in the element's size.
287 uint64_t ScalarSizeInBytes = ScalarSize / 8;
288 if (Offset.urem(ScalarSizeInBytes) != 0)
289 return false;
290
291 // If we load MinVecNumElts, will our target element still be loaded?
292 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
293 if (OffsetEltIndex >= MinVecNumElts)
294 return false;
295
296 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
297 &DT))
298 return false;
299
300 // Update alignment with offset value. Note that the offset could be negated
301 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
302 // negation does not change the result of the alignment calculation.
303 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
304 }
305
306 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
307 // Use the greater of the alignment on the load or its source pointer.
308 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
309 Type *LoadTy = Load->getType();
310 unsigned AS = Load->getPointerAddressSpace();
311 InstructionCost OldCost =
312 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
313 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
314 OldCost +=
315 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
316 /* Insert */ true, HasExtract, CostKind);
317
318 // New pattern: load VecPtr
319 InstructionCost NewCost =
320 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
321 // Optionally, we are shuffling the loaded vector element(s) into place.
322 // For the mask set everything but element 0 to undef to prevent poison from
323 // propagating from the extra loaded memory. This will also optionally
324 // shrink/grow the vector from the loaded size to the output size.
325 // We assume this operation has no cost in codegen if there was no offset.
326 // Note that we could use freeze to avoid poison problems, but then we might
327 // still need a shuffle to change the vector size.
328 auto *Ty = cast<FixedVectorType>(I.getType());
329 unsigned OutputNumElts = Ty->getNumElements();
330 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
331 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
332 Mask[0] = OffsetEltIndex;
333 if (OffsetEltIndex)
334 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
335 CostKind);
336
337 // We can aggressively convert to the vector form because the backend can
338 // invert this transform if it does not result in a performance win.
339 if (OldCost < NewCost || !NewCost.isValid())
340 return false;
341
342 // It is safe and potentially profitable to load a vector directly:
343 // inselt undef, load Scalar, 0 --> load VecPtr
344 IRBuilder<> Builder(Load);
345 Value *CastedPtr =
346 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
347 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
348 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
349
350 replaceValue(I, *VecLd);
351 ++NumVecLoad;
352 return true;
353}
354
355/// If we are loading a vector and then inserting it into a larger vector with
356/// undefined elements, try to load the larger vector and eliminate the insert.
357/// This removes a shuffle in IR and may allow combining of other loaded values.
358bool VectorCombine::widenSubvectorLoad(Instruction &I) {
359 // Match subvector insert of fixed vector.
360 auto *Shuf = cast<ShuffleVectorInst>(&I);
361 if (!Shuf->isIdentityWithPadding())
362 return false;
363
364 // Allow a non-canonical shuffle mask that is choosing elements from op1.
365 unsigned NumOpElts =
366 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
367 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
368 return M >= (int)(NumOpElts);
369 });
370
371 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
372 if (!canWidenLoad(Load, TTI))
373 return false;
374
375 // We use minimal alignment (maximum flexibility) because we only care about
376 // the dereferenceable region. When calculating cost and creating a new op,
377 // we may use a larger value based on alignment attributes.
378 auto *Ty = cast<FixedVectorType>(I.getType());
379 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
380 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
381 Align Alignment = Load->getAlign();
382 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
383 return false;
384
385 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
386 Type *LoadTy = Load->getType();
387 unsigned AS = Load->getPointerAddressSpace();
388
389 // Original pattern: insert_subvector (load PtrOp)
390 // This conservatively assumes that the cost of a subvector insert into an
391 // undef value is 0. We could add that cost if the cost model accurately
392 // reflects the real cost of that operation.
393 InstructionCost OldCost =
394 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
395
396 // New pattern: load PtrOp
397 InstructionCost NewCost =
398 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
399
400 // We can aggressively convert to the vector form because the backend can
401 // invert this transform if it does not result in a performance win.
402 if (OldCost < NewCost || !NewCost.isValid())
403 return false;
404
405 IRBuilder<> Builder(Load);
406 Value *CastedPtr =
407 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
408 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
409 replaceValue(I, *VecLd);
410 ++NumVecLoad;
411 return true;
412}
413
414/// Determine which, if any, of the inputs should be replaced by a shuffle
415/// followed by extract from a different index.
416ExtractElementInst *VectorCombine::getShuffleExtract(
417 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
418 unsigned PreferredExtractIndex = InvalidIndex) const {
419 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
420 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
421 assert(Index0C && Index1C && "Expected constant extract indexes");
422
423 unsigned Index0 = Index0C->getZExtValue();
424 unsigned Index1 = Index1C->getZExtValue();
425
426 // If the extract indexes are identical, no shuffle is needed.
427 if (Index0 == Index1)
428 return nullptr;
429
430 Type *VecTy = Ext0->getVectorOperand()->getType();
431 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
432 InstructionCost Cost0 =
433 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
434 InstructionCost Cost1 =
435 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
436
437 // If both costs are invalid no shuffle is needed
438 if (!Cost0.isValid() && !Cost1.isValid())
439 return nullptr;
440
441 // We are extracting from 2 different indexes, so one operand must be shuffled
442 // before performing a vector operation and/or extract. The more expensive
443 // extract will be replaced by a shuffle.
444 if (Cost0 > Cost1)
445 return Ext0;
446 if (Cost1 > Cost0)
447 return Ext1;
448
449 // If the costs are equal and there is a preferred extract index, shuffle the
450 // opposite operand.
451 if (PreferredExtractIndex == Index0)
452 return Ext1;
453 if (PreferredExtractIndex == Index1)
454 return Ext0;
455
456 // Otherwise, replace the extract with the higher index.
457 return Index0 > Index1 ? Ext0 : Ext1;
458}
459
460/// Compare the relative costs of 2 extracts followed by scalar operation vs.
461/// vector operation(s) followed by extract. Return true if the existing
462/// instructions are cheaper than a vector alternative. Otherwise, return false
463/// and if one of the extracts should be transformed to a shufflevector, set
464/// \p ConvertToShuffle to that extract instruction.
465bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
466 ExtractElementInst *Ext1,
467 const Instruction &I,
468 ExtractElementInst *&ConvertToShuffle,
469 unsigned PreferredExtractIndex) {
470 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
471 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
472 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
473
474 unsigned Opcode = I.getOpcode();
475 Value *Ext0Src = Ext0->getVectorOperand();
476 Value *Ext1Src = Ext1->getVectorOperand();
477 Type *ScalarTy = Ext0->getType();
478 auto *VecTy = cast<VectorType>(Ext0Src->getType());
479 InstructionCost ScalarOpCost, VectorOpCost;
480
481 // Get cost estimates for scalar and vector versions of the operation.
482 bool IsBinOp = Instruction::isBinaryOp(Opcode);
483 if (IsBinOp) {
484 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
485 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
486 } else {
487 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
488 "Expected a compare");
489 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
490 ScalarOpCost = TTI.getCmpSelInstrCost(
491 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
492 VectorOpCost = TTI.getCmpSelInstrCost(
493 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
494 }
495
496 // Get cost estimates for the extract elements. These costs will factor into
497 // both sequences.
498 unsigned Ext0Index = Ext0IndexC->getZExtValue();
499 unsigned Ext1Index = Ext1IndexC->getZExtValue();
500
501 InstructionCost Extract0Cost =
502 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
503 InstructionCost Extract1Cost =
504 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
505
506 // A more expensive extract will always be replaced by a splat shuffle.
507 // For example, if Ext0 is more expensive:
508 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
509 // extelt (opcode (splat V0, Ext0), V1), Ext1
510 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
511 // check the cost of creating a broadcast shuffle and shuffling both
512 // operands to element 0.
513 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
514 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
515 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
516
517 // Extra uses of the extracts mean that we include those costs in the
518 // vector total because those instructions will not be eliminated.
519 InstructionCost OldCost, NewCost;
520 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
521 // Handle a special case. If the 2 extracts are identical, adjust the
522 // formulas to account for that. The extra use charge allows for either the
523 // CSE'd pattern or an unoptimized form with identical values:
524 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
525 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
526 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
527 OldCost = CheapExtractCost + ScalarOpCost;
528 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
529 } else {
530 // Handle the general case. Each extract is actually a different value:
531 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
532 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
533 NewCost = VectorOpCost + CheapExtractCost +
534 !Ext0->hasOneUse() * Extract0Cost +
535 !Ext1->hasOneUse() * Extract1Cost;
536 }
537
538 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
539 if (ConvertToShuffle) {
540 if (IsBinOp && DisableBinopExtractShuffle)
541 return true;
542
543 // If we are extracting from 2 different indexes, then one operand must be
544 // shuffled before performing the vector operation. The shuffle mask is
545 // poison except for 1 lane that is being translated to the remaining
546 // extraction lane. Therefore, it is a splat shuffle. Ex:
547 // ShufMask = { poison, poison, 0, poison }
548 // TODO: The cost model has an option for a "broadcast" shuffle
549 // (splat-from-element-0), but no option for a more general splat.
550 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
551 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
553 ShuffleMask[BestInsIndex] = BestExtIndex;
555 VecTy, VecTy, ShuffleMask, CostKind, 0,
556 nullptr, {ConvertToShuffle});
557 } else {
559 VecTy, VecTy, {}, CostKind, 0, nullptr,
560 {ConvertToShuffle});
561 }
562 }
563
564 // Aggressively form a vector op if the cost is equal because the transform
565 // may enable further optimization.
566 // Codegen can reverse this transform (scalarize) if it was not profitable.
567 return OldCost < NewCost;
568}
569
570/// Create a shuffle that translates (shifts) 1 element from the input vector
571/// to a new element location.
572static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
573 unsigned NewIndex, IRBuilderBase &Builder) {
574 // The shuffle mask is poison except for 1 lane that is being translated
575 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
576 // ShufMask = { 2, poison, poison, poison }
577 auto *VecTy = cast<FixedVectorType>(Vec->getType());
578 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
579 ShufMask[NewIndex] = OldIndex;
580 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
581}
582
583/// Given an extract element instruction with constant index operand, shuffle
584/// the source vector (shift the scalar element) to a NewIndex for extraction.
585/// Return null if the input can be constant folded, so that we are not creating
586/// unnecessary instructions.
587static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
588 IRBuilderBase &Builder) {
589 // Shufflevectors can only be created for fixed-width vectors.
590 Value *X = ExtElt->getVectorOperand();
591 if (!isa<FixedVectorType>(X->getType()))
592 return nullptr;
593
594 // If the extract can be constant-folded, this code is unsimplified. Defer
595 // to other passes to handle that.
596 Value *C = ExtElt->getIndexOperand();
597 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
598 if (isa<Constant>(X))
599 return nullptr;
600
601 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
602 NewIndex, Builder);
603 return Shuf;
604}
605
606/// Try to reduce extract element costs by converting scalar compares to vector
607/// compares followed by extract.
608/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
609Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
610 Instruction &I) {
611 assert(isa<CmpInst>(&I) && "Expected a compare");
612
613 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
614 // --> extelt (cmp Pred V0, V1), ExtIndex
615 ++NumVecCmp;
616 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
617 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
618 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
619}
620
621/// Try to reduce extract element costs by converting scalar binops to vector
622/// binops followed by extract.
623/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
624Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
625 Instruction &I) {
626 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
627
628 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
629 // --> extelt (bo V0, V1), ExtIndex
630 ++NumVecBO;
631 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
632 V1, "foldExtExtBinop");
633
634 // All IR flags are safe to back-propagate because any potential poison
635 // created in unused vector elements is discarded by the extract.
636 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
637 VecBOInst->copyIRFlags(&I);
638
639 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
640}
641
642/// Match an instruction with extracted vector operands.
643bool VectorCombine::foldExtractExtract(Instruction &I) {
644 // It is not safe to transform things like div, urem, etc. because we may
645 // create undefined behavior when executing those on unknown vector elements.
647 return false;
648
649 Instruction *I0, *I1;
650 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
651 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
653 return false;
654
655 Value *V0, *V1;
656 uint64_t C0, C1;
657 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
658 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
659 V0->getType() != V1->getType())
660 return false;
661
662 // If the scalar value 'I' is going to be re-inserted into a vector, then try
663 // to create an extract to that same element. The extract/insert can be
664 // reduced to a "select shuffle".
665 // TODO: If we add a larger pattern match that starts from an insert, this
666 // probably becomes unnecessary.
667 auto *Ext0 = cast<ExtractElementInst>(I0);
668 auto *Ext1 = cast<ExtractElementInst>(I1);
669 uint64_t InsertIndex = InvalidIndex;
670 if (I.hasOneUse())
671 match(I.user_back(),
672 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
673
674 ExtractElementInst *ExtractToChange;
675 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
676 return false;
677
678 Value *ExtOp0 = Ext0->getVectorOperand();
679 Value *ExtOp1 = Ext1->getVectorOperand();
680
681 if (ExtractToChange) {
682 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
683 Value *NewExtOp =
684 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
685 if (!NewExtOp)
686 return false;
687 if (ExtractToChange == Ext0)
688 ExtOp0 = NewExtOp;
689 else
690 ExtOp1 = NewExtOp;
691 }
692
693 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
694 : Ext0->getIndexOperand();
695 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
696 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
697 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
698 Worklist.push(Ext0);
699 Worklist.push(Ext1);
700 replaceValue(I, *NewExt);
701 return true;
702}
703
704/// Try to replace an extract + scalar fneg + insert with a vector fneg +
705/// shuffle.
706bool VectorCombine::foldInsExtFNeg(Instruction &I) {
707 // Match an insert (op (extract)) pattern.
708 Value *DstVec;
709 uint64_t ExtIdx, InsIdx;
710 Instruction *FNeg;
711 if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
712 m_ConstantInt(InsIdx))))
713 return false;
714
715 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
716 Value *SrcVec;
717 Instruction *Extract;
718 if (!match(FNeg, m_FNeg(m_CombineAnd(
719 m_Instruction(Extract),
720 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
721 return false;
722
723 auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
724 auto *DstVecScalarTy = DstVecTy->getScalarType();
725 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
726 if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
727 return false;
728
729 // Ignore if insert/extract index is out of bounds or destination vector has
730 // one element
731 unsigned NumDstElts = DstVecTy->getNumElements();
732 unsigned NumSrcElts = SrcVecTy->getNumElements();
733 if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
734 return false;
735
736 // We are inserting the negated element into the same lane that we extracted
737 // from. This is equivalent to a select-shuffle that chooses all but the
738 // negated element from the destination vector.
739 SmallVector<int> Mask(NumDstElts);
740 std::iota(Mask.begin(), Mask.end(), 0);
741 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
742 InstructionCost OldCost =
743 TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
744 TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
745
746 // If the extract has one use, it will be eliminated, so count it in the
747 // original cost. If it has more than one use, ignore the cost because it will
748 // be the same before/after.
749 if (Extract->hasOneUse())
750 OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
751
752 InstructionCost NewCost =
753 TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
755 DstVecTy, Mask, CostKind);
756
757 bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
758 // If the lengths of the two vectors are not equal,
759 // we need to add a length-change vector. Add this cost.
760 SmallVector<int> SrcMask;
761 if (NeedLenChg) {
762 SrcMask.assign(NumDstElts, PoisonMaskElem);
763 SrcMask[ExtIdx % NumDstElts] = ExtIdx;
765 DstVecTy, SrcVecTy, SrcMask, CostKind);
766 }
767
768 LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
769 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
770 << "\n");
771 if (NewCost > OldCost)
772 return false;
773
774 Value *NewShuf, *LenChgShuf = nullptr;
775 // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
776 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
777 if (NeedLenChg) {
778 // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
779 LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
780 NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
781 Worklist.pushValue(LenChgShuf);
782 } else {
783 // shuffle DstVec, (fneg SrcVec), Mask
784 NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
785 }
786
787 Worklist.pushValue(VecFNeg);
788 replaceValue(I, *NewShuf);
789 return true;
790}
791
792/// Try to fold insert(binop(x,y),binop(a,b),idx)
793/// --> binop(insert(x,a,idx),insert(y,b,idx))
///
/// The vector binop and the scalar binop must each have a single use and must
/// share the same opcode; the insert index must be a constant and the result
/// must be a fixed-width vector. The rewrite is performed unless the estimated
/// new cost is strictly greater than the old cost.
///
/// \param I the candidate insertelement instruction.
/// \returns true if \p I was replaced by the new vector binop.
794bool VectorCombine::foldInsExtBinop(Instruction &I) {
795 BinaryOperator *VecBinOp, *SclBinOp;
796 uint64_t Index;
    // Match an insertelement whose vector and scalar operands are both one-use
    // binary operators, inserted at a constant index.
797 if (!match(&I,
798 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
799 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
800 return false;
801
802 // TODO: Add support for addlike etc.
803 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
804 if (BinOpcode != SclBinOp->getOpcode())
805 return false;
806
    // Scalable vector results are rejected here.
807 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
808 if (!ResultTy)
809 return false;
810
811 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
812 // shuffle?
813
    // NOTE(review): this listing skips source lines 814 and 816, so the
    // declaration of OldCost and part of its initializer are not visible here.
    // Confirm against the upstream file before editing this expression.
815 TTI.getInstructionCost(VecBinOp, CostKind) +
    // New sequence: one vector binop on the result type plus one insertelement
    // per binop operand.
817 InstructionCost NewCost =
818 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
819 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
820 Index, VecBinOp->getOperand(0),
821 SclBinOp->getOperand(0)) +
822 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
823 Index, VecBinOp->getOperand(1),
824 SclBinOp->getOperand(1));
825
826 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
827 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
828 << "\n");
    // Equal cost still folds; only a strictly more expensive result bails.
829 if (NewCost > OldCost)
830 return false;
831
    // Insert each scalar operand into the corresponding vector operand, then
    // apply the binop once on the two new vectors.
832 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
833 SclBinOp->getOperand(0), Index);
834 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
835 SclBinOp->getOperand(1), Index);
836 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
837
838 // Intersect flags from the old binops.
    // The builder may return a non-instruction value, hence the dyn_cast.
839 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
840 NewInst->copyIRFlags(VecBinOp);
841 NewInst->andIRFlags(SclBinOp);
842 }
843
    // Queue the new inserts so they are revisited by the combiner.
844 Worklist.pushValue(NewIns0);
845 Worklist.pushValue(NewIns1);
846 replaceValue(I, *NewBO);
847 return true;
848}
849
850/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
851/// Supports: bitcast, trunc, sext, zext
///
/// Both casts must use the same cast opcode and have the same source type.
/// Casts other than bitcast additionally require fixed-width vector source and
/// destination types. The rewrite is performed unless the estimated new cost
/// is strictly greater than the old cost.
///
/// \param I the candidate bitwise logic instruction.
/// \returns true if \p I was replaced by a single cast of the narrowed/widened
/// logic op.
852bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
853 // Check if this is a bitwise logic operation
854 auto *BinOp = dyn_cast<BinaryOperator>(&I);
855 if (!BinOp || !BinOp->isBitwiseLogicOp())
856 return false;
857
858 // Get the cast instructions
859 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
860 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
861 if (!LHSCast || !RHSCast) {
862 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
863 return false;
864 }
865
866 // Both casts must be the same type
867 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
868 if (CastOpcode != RHSCast->getOpcode())
869 return false;
870
871 // Only handle supported cast operations
872 switch (CastOpcode) {
873 case Instruction::BitCast:
874 case Instruction::Trunc:
875 case Instruction::SExt:
876 case Instruction::ZExt:
877 break;
878 default:
879 return false;
880 }
881
882 Value *LHSSrc = LHSCast->getOperand(0);
883 Value *RHSSrc = RHSCast->getOperand(0);
884
885 // Source types must match
886 if (LHSSrc->getType() != RHSSrc->getType())
887 return false;
888
889 auto *SrcTy = LHSSrc->getType();
890 auto *DstTy = I.getType();
891 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
892 // Other casts only handle vector types with integer elements.
893 if (CastOpcode != Instruction::BitCast &&
894 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
895 return false;
896
897 // Only integer scalar/vector values are legal for bitwise logic operations.
898 if (!SrcTy->getScalarType()->isIntegerTy() ||
899 !DstTy->getScalarType()->isIntegerTy())
900 return false;
901
902 // Cost Check :
903 // OldCost = bitlogic + 2*casts
904 // NewCost = bitlogic + cast
905
906 // Calculate specific costs for each cast with instruction context
    // NOTE(review): this listing skips source lines 907 and 909, which
    // presumably declare LHSCastCost and RHSCastCost (both are used below).
    // Confirm against the upstream file.
908 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
910 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
911
    // Old sequence: logic op on the destination type plus both casts.
912 InstructionCost OldCost =
913 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
914 LHSCastCost + RHSCastCost;
915
916 // For new cost, we can't provide an instruction (it doesn't exist yet)
917 InstructionCost GenericCastCost = TTI.getCastInstrCost(
918 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
919
    // New sequence: logic op on the source type plus a single cast.
920 InstructionCost NewCost =
921 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
922 GenericCastCost;
923
924 // Account for multi-use casts using specific costs
    // A cast with other users stays in the function after the fold, so its
    // cost is charged to the new sequence as well.
925 if (!LHSCast->hasOneUse())
926 NewCost += LHSCastCost;
927 if (!RHSCast->hasOneUse())
928 NewCost += RHSCastCost;
929
930 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
931 << " NewCost=" << NewCost << "\n");
932
    // Equal cost still folds; only a strictly more expensive result bails.
933 if (NewCost > OldCost)
934 return false;
935
936 // Create the operation on the source type
937 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
938 BinOp->getName() + ".inner");
    // The builder may return a non-instruction value, hence the dyn_cast.
939 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
940 NewBinOp->copyIRFlags(BinOp);
941
942 Worklist.pushValue(NewOp);
943
944 // Create the cast operation directly to ensure we get a new instruction
945 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
946
947 // Preserve cast instruction flags
    // Start from the LHS cast's flags and intersect with the RHS cast's flags.
948 NewCast->copyIRFlags(LHSCast);
949 NewCast->andIRFlags(RHSCast);
950
951 // Insert the new instruction
952 Value *Result = Builder.Insert(NewCast);
953
954 replaceValue(I, *Result);
955 return true;
956}
957
958/// Match:
959/// bitop(castop(x), C) ->
960/// bitop(castop(x), castop(InvC)) ->
961/// castop(bitop(x, InvC))
962/// Supports: bitcast, zext, sext, trunc
///
/// InvC is a constant of the cast's source type obtained from
/// getLosslessInvCast; the fold is abandoned when no such constant exists.
/// Casts other than bitcast additionally require fixed-width vector source and
/// destination types. The rewrite is performed unless the estimated new cost
/// is strictly greater than the old cost.
///
/// \param I the candidate bitwise logic instruction.
/// \returns true if \p I was replaced by a cast of the logic op on the source
/// type.
963bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
    // NOTE(review): this listing skips source line 964, which presumably
    // declares LHS (used below). Confirm against the upstream file.
965 Constant *C;
966
967 // Check if this is a bitwise logic operation
    // NOTE(review): this listing skips source line 968, which presumably holds
    // the match condition that binds LHS and C. Confirm against the upstream
    // file.
969 return false;
970
971 // Get the cast instructions
972 auto *LHSCast = dyn_cast<CastInst>(LHS);
973 if (!LHSCast)
974 return false;
975
976 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
977
978 // Only handle supported cast operations
979 switch (CastOpcode) {
980 case Instruction::BitCast:
981 case Instruction::ZExt:
982 case Instruction::SExt:
983 case Instruction::Trunc:
984 break;
985 default:
986 return false;
987 }
988
989 Value *LHSSrc = LHSCast->getOperand(0);
990
991 auto *SrcTy = LHSSrc->getType();
992 auto *DstTy = I.getType();
993 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
994 // Other casts only handle vector types with integer elements.
995 if (CastOpcode != Instruction::BitCast &&
996 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
997 return false;
998
999 // Only integer scalar/vector values are legal for bitwise logic operations.
1000 if (!SrcTy->getScalarType()->isIntegerTy() ||
1001 !DstTy->getScalarType()->isIntegerTy())
1002 return false;
1003
1004 // Find the constant InvC, such that castop(InvC) equals to C.
     // RHSFlags is filled in by the helper and later applied to the new cast.
1005 PreservedCastFlags RHSFlags;
1006 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
1007 if (!InvC)
1008 return false;
1009
1010 // Cost Check :
1011 // OldCost = bitlogic + cast
1012 // NewCost = bitlogic + cast
1013
1014 // Calculate specific costs for each cast with instruction context
1015 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1016 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1017
     // Old sequence: logic op on the destination type plus the existing cast.
1018 InstructionCost OldCost =
1019 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1020
1021 // For new cost, we can't provide an instruction (it doesn't exist yet)
1022 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1023 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1024
     // New sequence: logic op on the source type plus a single cast.
1025 InstructionCost NewCost =
1026 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1027 GenericCastCost;
1028
1029 // Account for multi-use casts using specific costs
     // A cast with other users stays in the function after the fold, so its
     // cost is charged to the new sequence as well.
1030 if (!LHSCast->hasOneUse())
1031 NewCost += LHSCastCost;
1032
1033 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1034 << " NewCost=" << NewCost << "\n");
1035
     // Equal cost still folds; only a strictly more expensive result bails.
1036 if (NewCost > OldCost)
1037 return false;
1038
1039 // Create the operation on the source type
1040 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1041 LHSSrc, InvC, I.getName() + ".inner");
     // The builder may return a non-instruction value, hence the dyn_cast.
1042 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1043 NewBinOp->copyIRFlags(&I);
1044
1045 Worklist.pushValue(NewOp);
1046
1047 // Create the cast operation directly to ensure we get a new instruction
1048 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1049
1050 // Preserve cast instruction flags
     // First set the flags reported for the constant's inverse cast...
1051 if (RHSFlags.NNeg)
1052 NewCast->setNonNeg();
1053 if (RHSFlags.NUW)
1054 NewCast->setHasNoUnsignedWrap();
1055 if (RHSFlags.NSW)
1056 NewCast->setHasNoSignedWrap();
1057
     // ...then intersect them with the flags of the original cast.
1058 NewCast->andIRFlags(LHSCast);
1059
1060 // Insert the new instruction
1061 Value *Result = Builder.Insert(NewCast);
1062
1063 replaceValue(I, *Result);
1064 return true;
1065}
1066
1067/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1068/// destination type followed by shuffle. This can enable further transforms by
1069/// moving bitcasts or shuffles together.
///
/// The shuffle must have a single use, and both the bitcast result and the
/// shuffle source must be fixed-width vectors. The rewrite is performed unless
/// the estimated new cost is invalid or strictly greater than the old cost.
///
/// \param I the candidate bitcast instruction.
/// \returns true if \p I was replaced by a shuffle of bitcasted operands.
1070bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1071 Value *V0, *V1;
1072 ArrayRef<int> Mask;
1073 if (!match(&I, m_BitCast(m_OneUse(
1074 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1075 return false;
1076
1077 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1078 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1079 // mask for scalable type is a splat or not.
1080 // 2) Disallow non-vector casts.
1081 // TODO: We could allow any shuffle.
1082 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1083 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1084 if (!DestTy || !SrcTy)
1085 return false;
1086
1087 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1088 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
     // The source vector must divide evenly into destination-sized elements so
     // it can be bitcast to a vector of the destination scalar type.
1089 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1090 return false;
1091
     // A shuffle whose second operand is undef/poison is treated as unary.
1092 bool IsUnary = isa<UndefValue>(V1);
1093
1094 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1095 // if it won't increase the number of bitcasts.
1096 if (!IsUnary) {
     // NOTE(review): this listing skips source lines 1097-1098, which
     // presumably declare BCTy0 and BCTy1 (used in the condition below).
     // Confirm against the upstream file.
1099 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1100 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1101 return false;
1102 }
1103
     // Rescale the shuffle mask to the destination element width.
1104 SmallVector<int, 16> NewMask;
1105 if (DestEltSize <= SrcEltSize) {
1106 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1107 // always be expanded to the equivalent form choosing narrower elements.
1108 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
1109 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1110 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1111 } else {
1112 // The bitcast is from narrow elements to wide elements. The shuffle mask
1113 // must choose consecutive elements to allow casting first.
1114 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
1115 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1116 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1117 return false;
1118 }
1119
1120 // Bitcast the shuffle src - keep its original width but using the destination
1121 // scalar type.
1122 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1123 auto *NewShuffleTy =
1124 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1125 auto *OldShuffleTy =
1126 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
     // Number of source operands that need a bitcast in the new sequence.
1127 unsigned NumOps = IsUnary ? 1 : 2;
1128
1129 // The new shuffle must not cost more than the old shuffle.
     // NOTE(review): this listing skips source lines 1130-1132, which
     // presumably declare the shuffle kind SK used in both cost queries below.
     // Confirm against the upstream file.
1133
     // New sequence: shuffle on the destination element type plus one bitcast
     // per source operand.
1134 InstructionCost NewCost =
1135 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1136 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1137 TargetTransformInfo::CastContextHint::None,
1138 CostKind));
     // Old sequence: the original shuffle plus the bitcast of its result.
1139 InstructionCost OldCost =
1140 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1141 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1142 TargetTransformInfo::CastContextHint::None,
1143 CostKind);
1144
1145 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1146 << OldCost << " vs NewCost: " << NewCost << "\n");
1147
1148 if (NewCost > OldCost || !NewCost.isValid())
1149 return false;
1150
1151 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1152 ++NumShufOfBitcast;
     // Cast from the value underneath any existing bitcasts on each operand.
1153 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1154 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1155 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1156 replaceValue(I, *Shuf);
1157 return true;
1158}
1159
1160/// VP Intrinsics whose vector operands are both splat values may be simplified
1161/// into the scalar version of the operation and the result splatted. This
1162/// can lead to scalarization down the line.
///
/// Only VP binary-op intrinsics with an all-true mask are handled. The rewrite
/// is performed unless the estimated scalarized cost is invalid or strictly
/// greater than the cost of the existing vector form.
///
/// \param I the candidate instruction; anything that is not a VPIntrinsic is
/// rejected.
/// \returns true if \p I was replaced by a splat of the scalar operation.
1163bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1164 if (!isa<VPIntrinsic>(I))
1165 return false;
1166 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1167 Value *Op0 = VPI.getArgOperand(0);
1168 Value *Op1 = VPI.getArgOperand(1);
1169
1170 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1171 return false;
1172
1173 // Check getSplatValue early in this function, to avoid doing unnecessary
1174 // work.
1175 Value *ScalarOp0 = getSplatValue(Op0);
1176 Value *ScalarOp1 = getSplatValue(Op1);
1177 if (!ScalarOp0 || !ScalarOp1)
1178 return false;
1179
1180 // For the binary VP intrinsics supported here, the result on disabled lanes
1181 // is a poison value. For now, only do this simplification if all lanes
1182 // are active.
1183 // TODO: Relax the condition that all lanes are active by using insertelement
1184 // on inactive lanes.
     // True only when the mask is a splat of a constant all-ones value.
1185 auto IsAllTrueMask = [](Value *MaskVal) {
1186 if (Value *SplattedVal = getSplatValue(MaskVal))
1187 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1188 return ConstValue->isAllOnesValue();
1189 return false;
1190 };
     // Argument 2 of the VP call is used as the mask operand.
1191 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1192 return false;
1193
1194 // Check to make sure we support scalarization of the intrinsic
1195 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1196 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1197 return false;
1198
1199 // Calculate cost of splatting both operands into vectors and the vector
1200 // intrinsic
1201 VectorType *VecTy = cast<VectorType>(VPI.getType());
     // The all-zero broadcast mask is only populated for fixed-width vectors;
     // for other vector types it stays empty.
1202 SmallVector<int> Mask;
1203 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1204 Mask.resize(FVTy->getNumElements(), 0);
1205 InstructionCost SplatCost =
1206 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
     // NOTE(review): this listing skips source line 1207, the start of the
     // second term of SplatCost. Confirm against the upstream file.
1208 CostKind);
1209
1210 // Calculate the cost of the VP Intrinsic
     // NOTE(review): this listing skips source line 1211, which presumably
     // declares Args (filled in below). Confirm against the upstream file.
1212 for (Value *V : VPI.args())
1213 Args.push_back(V->getType());
1214 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1215 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
     // Old sequence: two operand splats plus the vector intrinsic.
1216 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1217
1218 // Determine scalar opcode
     // Prefer a plain instruction opcode; fall back to a non-VP intrinsic ID
     // and give up when neither exists.
1219 std::optional<unsigned> FunctionalOpcode =
1220 VPI.getFunctionalOpcode();
1221 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1222 if (!FunctionalOpcode) {
1223 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1224 if (!ScalarIntrID)
1225 return false;
1226 }
1227
1228 // Calculate cost of scalarizing
1229 InstructionCost ScalarOpCost = 0;
1230 if (ScalarIntrID) {
     // NOTE(review): Args holds the VP call's argument types collected above,
     // not scalar types — confirm this is the intended input for the scalar
     // intrinsic cost query.
1231 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1232 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1233 } else {
1234 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1235 VecTy->getScalarType(), CostKind);
1236 }
1237
1238 // The existing splats may be kept around if other instructions use them.
     // Each bool converts to 0 or 1, so a splat is charged only when it has
     // other users.
1239 InstructionCost CostToKeepSplats =
1240 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
     // New sequence: scalar op, one splat of the result, and any kept splats.
1241 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1242
1243 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1244 << "\n");
1245 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1246 << ", Cost of scalarizing:" << NewCost << "\n");
1247
1248 // We want to scalarize unless the vector variant actually has lower cost.
1249 if (OldCost < NewCost || !NewCost.isValid())
1250 return false;
1251
1252 // Scalarize the intrinsic
1253 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
     // Argument 3 of the VP call is used as the explicit vector length.
1254 Value *EVL = VPI.getArgOperand(3);
1255
1256 // If the VP op might introduce UB or poison, we can scalarize it provided
1257 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1258 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1259 // scalarizing it.
1260 bool SafeToSpeculate;
1261 if (ScalarIntrID)
1262 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1263 .hasAttribute(Attribute::AttrKind::Speculatable);
1264 else
     // NOTE(review): this listing skips source line 1265, the start of the
     // assignment to SafeToSpeculate for the opcode case. Confirm against the
     // upstream file.
1266 *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
1267 if (!SafeToSpeculate &&
1268 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
1269 return false;
1270
     // Emit either the scalar intrinsic call or the scalar binary operator.
1271 Value *ScalarVal =
1272 ScalarIntrID
1273 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1274 {ScalarOp0, ScalarOp1})
1275 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1276 ScalarOp0, ScalarOp1);
1277
     // Broadcast the scalar result back to the original element count.
1278 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1279 return true;
1280}
1281
1282/// Match a vector op/compare/intrinsic with at least one
1283/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1284/// by insertelement.
///
/// Every operand must be a constant vector, a scalar inserted into a constant
/// vector at a shared constant index, or (for intrinsics) an argument that the
/// intrinsic takes as a scalar. The rewrite is performed unless the estimated
/// scalar cost is invalid or strictly greater than the vector cost.
///
/// \param I the candidate unary op, binary op, compare or intrinsic call.
/// \returns true if \p I was replaced by an insertelement of the scalar result.
1285bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1286 auto *UO = dyn_cast<UnaryOperator>(&I);
1287 auto *BO = dyn_cast<BinaryOperator>(&I);
1288 auto *CI = dyn_cast<CmpInst>(&I);
1289 auto *II = dyn_cast<IntrinsicInst>(&I);
1290 if (!UO && !BO && !CI && !II)
1291 return false;
1292
1293 // TODO: Allow intrinsics with different argument types
1294 if (II) {
1295 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1296 return false;
     // Every argument must either have the call's result type or be one the
     // intrinsic takes as a scalar.
1297 for (auto [Idx, Arg] : enumerate(II->args()))
1298 if (Arg->getType() != II->getType() &&
1299 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1300 return false;
1301 }
1302
1303 // Do not convert the vector condition of a vector select into a scalar
1304 // condition. That may cause problems for codegen because of differences in
1305 // boolean formats and register-file transfers.
1306 // TODO: Can we account for that in the cost model?
1307 if (CI)
1308 for (User *U : I.users())
1309 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1310 return false;
1311
1312 // Match constant vectors or scalars being inserted into constant vectors:
1313 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
     // VecCs and ScalarOps are parallel to the operand list; a null ScalarOps
     // entry marks a purely constant operand.
1314 SmallVector<Value *> VecCs, ScalarOps;
1315 std::optional<uint64_t> Index;
1316
1317 auto Ops = II ? II->args() : I.operands();
1318 for (auto [OpNum, Op] : enumerate(Ops)) {
1319 Constant *VecC;
1320 Value *V;
1321 uint64_t InsIdx = 0;
1322 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1323 m_ConstantInt(InsIdx)))) {
1324 // Bail if any inserts are out of bounds.
1325 VectorType *OpTy = cast<VectorType>(Op->getType());
1326 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1327 return false;
1328 // All inserts must have the same index.
1329 // TODO: Deal with mismatched index constants and variable indexes?
1330 if (!Index)
1331 Index = InsIdx;
1332 else if (InsIdx != *Index)
1333 return false;
1334 VecCs.push_back(VecC);
1335 ScalarOps.push_back(V);
1336 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1337 OpNum, &TTI)) {
     // Scalar intrinsic argument: passed through unchanged to both lists.
1338 VecCs.push_back(Op.get());
1339 ScalarOps.push_back(Op.get());
1340 } else if (match(Op.get(), m_Constant(VecC))) {
1341 VecCs.push_back(VecC);
1342 ScalarOps.push_back(nullptr);
1343 } else {
1344 return false;
1345 }
1346 }
1347
1348 // Bail if all operands are constant.
1349 if (!Index.has_value())
1350 return false;
1351
1352 VectorType *VecTy = cast<VectorType>(I.getType());
1353 Type *ScalarTy = VecTy->getScalarType();
1354 assert(VecTy->isVectorTy() &&
1355 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1356 ScalarTy->isPointerTy()) &&
1357 "Unexpected types for insert element into binop or cmp");
1358
     // Query the cost of the operation in both its scalar and vector forms.
1359 unsigned Opcode = I.getOpcode();
1360 InstructionCost ScalarOpCost, VectorOpCost;
1361 if (CI) {
1362 CmpInst::Predicate Pred = CI->getPredicate();
1363 ScalarOpCost = TTI.getCmpSelInstrCost(
1364 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1365 VectorOpCost = TTI.getCmpSelInstrCost(
1366 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1367 } else if (UO || BO) {
1368 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1369 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1370 } else {
1371 IntrinsicCostAttributes ScalarICA(
1372 II->getIntrinsicID(), ScalarTy,
1373 SmallVector<Type *>(II->arg_size(), ScalarTy));
1374 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1375 IntrinsicCostAttributes VectorICA(
1376 II->getIntrinsicID(), VecTy,
1377 SmallVector<Type *>(II->arg_size(), VecTy));
1378 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1379 }
1380
1381 // Fold the vector constants in the original vectors into a new base vector to
1382 // get more accurate cost modelling.
1383 Value *NewVecC = nullptr;
1384 if (CI)
1385 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1386 else if (UO)
1387 NewVecC =
1388 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1389 else if (BO)
1390 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1391 else if (II)
1392 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1393
     // Without a simplified base vector there is nothing to insert into.
1394 if (!NewVecC)
1395 return false;
1396
1397 // Get cost estimate for the insert element. This cost will factor into
1398 // both sequences.
1399 InstructionCost OldCost = VectorOpCost;
1400 InstructionCost NewCost =
1401 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1402 CostKind, *Index, NewVecC);
1403
1404 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
     // Constant operands and scalar intrinsic arguments have no insertelement
     // to account for.
1405 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1406 II->getIntrinsicID(), Idx, &TTI)))
1407 continue;
     // NOTE(review): this listing skips source line 1408, which presumably
     // declares InsertCost (used below). Confirm against the upstream file.
1409 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1410 OldCost += InsertCost;
     // An insert with other users survives the fold, so it is charged to the
     // new sequence as well.
1411 NewCost += !Op->hasOneUse() * InsertCost;
1412 }
1413
1414 // We want to scalarize unless the vector variant actually has lower cost.
1415 if (OldCost < NewCost || !NewCost.isValid())
1416 return false;
1417
1418 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1419 // inselt NewVecC, (scalar_op V0, V1), Index
1420 if (CI)
1421 ++NumScalarCmp;
1422 else if (UO || BO)
1423 ++NumScalarOps;
1424 else
1425 ++NumScalarIntrinsic;
1426
1427 // For constant cases, extract the scalar element, this should constant fold.
1428 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1429 if (!Scalar)
     // NOTE(review): this listing skips source line 1430, the start of the
     // statement that fills in the missing scalar operand. Confirm against the
     // upstream file.
1431 cast<Constant>(VecC), Builder.getInt64(*Index));
1432
     // Build the scalar form of the original operation.
1433 Value *Scalar;
1434 if (CI)
1435 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1436 else if (UO || BO)
1437 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1438 else
1439 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1440
1441 Scalar->setName(I.getName() + ".scalar");
1442
1443 // All IR flags are safe to back-propagate. There is no potential for extra
1444 // poison to be created by the scalar instruction.
1445 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1446 ScalarInst->copyIRFlags(&I);
1447
     // Put the scalar result into the folded constant base vector.
1448 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1449 replaceValue(I, *Insert);
1450 return true;
1451}
1452
1453/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1454/// a vector into vector operations followed by extract. Note: The SLP pass
1455/// may miss this pattern because of implementation problems.
///
/// Both compares must have a constant operand and compatible predicates, and
/// both extracts must read the same fixed-width vector at constant indexes.
/// The rewrite is performed unless the estimated vector cost is invalid or
/// strictly greater than the scalar cost.
///
/// \param I the candidate i1 binary operator.
/// \returns true if \p I was replaced by an extract of the vector logic op.
1456bool VectorCombine::foldExtractedCmps(Instruction &I) {
1457 auto *BI = dyn_cast<BinaryOperator>(&I);
1458
1459 // We are looking for a scalar binop of booleans.
1460 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1461 if (!BI || !I.getType()->isIntegerTy(1))
1462 return false;
1463
1464 // The compare predicates should match, and each compare should have a
1465 // constant operand.
1466 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1467 Instruction *I0, *I1;
1468 Constant *C0, *C1;
1469 CmpPredicate P0, P1;
1470 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1471 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1472 return false;
1473
1474 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1475 if (!MatchingPred)
1476 return false;
1477
1478 // The compare operands must be extracts of the same vector with constant
1479 // extract indexes.
1480 Value *X;
1481 uint64_t Index0, Index1;
1482 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1483 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1484 return false;
1485
1486 auto *Ext0 = cast<ExtractElementInst>(I0);
1487 auto *Ext1 = cast<ExtractElementInst>(I1);
     // Pick which of the two extracts is to be replaced by a shuffle; bail when
     // the helper returns none.
1488 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1489 if (!ConvertToShuf)
1490 return false;
1491 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1492 "Unknown ExtractElementInst");
1493
1494 // The original scalar pattern is:
1495 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1496 CmpInst::Predicate Pred = *MatchingPred;
1497 unsigned CmpOpcode =
1498 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1499 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1500 if (!VecTy)
1501 return false;
1502
1503 InstructionCost Ext0Cost =
1504 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1505 InstructionCost Ext1Cost =
1506 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
     // NOTE(review): this listing skips source line 1507, which presumably
     // declares CmpCost (used below). Confirm against the upstream file.
1508 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1509 CostKind);
1510
     // Old sequence: two extracts, two scalar compares and the scalar binop.
1511 InstructionCost OldCost =
1512 Ext0Cost + Ext1Cost + CmpCost * 2 +
1513 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1514
1515 // The proposed vector pattern is:
1516 // vcmp = cmp Pred X, VecC
1517 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
     // The lane of the extract that is kept is "cheap"; the lane of the extract
     // converted to a shuffle is "expensive".
1518 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1519 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
     // NOTE(review): this listing skips source lines 1520-1521, which
     // presumably declare CmpTy and begin the NewCost initializer. Confirm
     // against the upstream file.
1522 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
     // Mask that moves the expensive lane into the cheap lane; every other
     // lane is poison.
1523 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1524 ShufMask[CheapIndex] = ExpensiveIndex;
     // NOTE(review): this listing skips source line 1525, the start of the
     // shuffle-cost term added to NewCost. Confirm against the upstream file.
1526 CmpTy, ShufMask, CostKind);
1527 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1528 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
     // Extracts with other users remain after the fold, so their cost is
     // charged to the new sequence as well.
1529 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1530 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1531
1532 // Aggressively form vector ops if the cost is equal because the transform
1533 // may enable further optimization.
1534 // Codegen can reverse this transform (scalarize) if it was not profitable.
1535 if (OldCost < NewCost || !NewCost.isValid())
1536 return false;
1537
1538 // Create a vector constant from the 2 scalar constants.
     // NOTE(review): CmpC is indexed by Index0/Index1 without a bounds check in
     // the visible code; presumably getShuffleExtract only succeeds for
     // in-range indexes — confirm.
1539 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1540 PoisonValue::get(VecTy->getElementType()));
1541 CmpC[Index0] = C0;
1542 CmpC[Index1] = C1;
1543 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1544 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
     // Keep the original operand order of the binop.
1545 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1546 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1547 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1548 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1549 replaceValue(I, *NewExt);
1550 ++NumVecCmpBO;
1551 return true;
1552}
1553
1554/// Try to fold scalar selects that select between extracted elements and zero
1555/// into extracting from a vector select. This is rooted at the bitcast.
1556///
1557/// This pattern arises when a vector is bitcast to a smaller element type,
1558/// elements are extracted, and then conditionally selected with zero:
1559///
1560/// %bc = bitcast <4 x i32> %src to <16 x i8>
1561/// %e0 = extractelement <16 x i8> %bc, i32 0
1562/// %s0 = select i1 %cond, i8 %e0, i8 0
1563/// %e1 = extractelement <16 x i8> %bc, i32 1
1564/// %s1 = select i1 %cond, i8 %e1, i8 0
1565/// ...
1566///
1567/// Transforms to:
1568/// %sel = select i1 %cond, <4 x i32> %src, <4 x i32> zeroinitializer
1569/// %bc = bitcast <4 x i32> %sel to <16 x i8>
1570/// %e0 = extractelement <16 x i8> %bc, i32 0
1571/// %e1 = extractelement <16 x i8> %bc, i32 1
1572/// ...
1573///
1574/// This is profitable because vector select on wider types produces fewer
1575/// select/cndmask instructions than scalar selects on each element.
///
/// Selects are grouped by condition; each group with enough members gets its
/// own vector select and bitcast.
///
/// \param I the candidate bitcast instruction.
/// \returns true if at least one scalar select was replaced.
1576bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
1577 auto *BC = dyn_cast<BitCastInst>(&I);
1578 if (!BC)
1579 return false;
1580
1581 FixedVectorType *SrcVecTy = dyn_cast<FixedVectorType>(BC->getSrcTy());
1582 FixedVectorType *DstVecTy = dyn_cast<FixedVectorType>(BC->getDestTy());
1583 if (!SrcVecTy || !DstVecTy)
1584 return false;
1585
1586 // Source must be 32-bit or 64-bit elements, destination must be smaller
1587 // integer elements. Zero in all these types is all-bits-zero.
1588 Type *SrcEltTy = SrcVecTy->getElementType();
1589 Type *DstEltTy = DstVecTy->getElementType();
1590 unsigned SrcEltBits = SrcEltTy->getPrimitiveSizeInBits();
1591 unsigned DstEltBits = DstEltTy->getPrimitiveSizeInBits();
1592
1593 if (SrcEltBits != 32 && SrcEltBits != 64)
1594 return false;
1595
1596 if (!DstEltTy->isIntegerTy() || DstEltBits >= SrcEltBits)
1597 return false;
1598
1599 // Check profitability using TTI before collecting users.
1600 Type *CondTy = CmpInst::makeCmpResultType(DstEltTy);
1601 Type *VecCondTy = CmpInst::makeCmpResultType(SrcVecTy);
1602
     // NOTE(review): this listing skips source lines 1605 and 1608, the
     // trailing arguments of the two getCmpSelInstrCost calls below. Confirm
     // against the upstream file.
1603 InstructionCost ScalarSelCost =
1604 TTI.getCmpSelInstrCost(Instruction::Select, DstEltTy, CondTy,
1606 InstructionCost VecSelCost =
1607 TTI.getCmpSelInstrCost(Instruction::Select, SrcVecTy, VecCondTy,
1609
1610 // We need at least this many selects for vectorization to be profitable.
1611 // VecSelCost < ScalarSelCost * NumSelects => NumSelects > VecSelCost /
1612 // ScalarSelCost
     // The zero check also guards the division below.
1613 if (!ScalarSelCost.isValid() || ScalarSelCost == 0)
1614 return false;
1615
     // NOTE(review): VecSelCost.getValue() is called without a visible
     // VecSelCost.isValid() check — confirm whether an invalid vector cost can
     // reach this point.
1616 unsigned MinSelects = (VecSelCost.getValue() / ScalarSelCost.getValue()) + 1;
1617
1618 // Quick check: if bitcast doesn't have enough users, bail early.
1619 if (!BC->hasNUsesOrMore(MinSelects))
1620 return false;
1621
1622 // Collect all select users that match the pattern, grouped by condition.
1623 // Pattern: select i1 %cond, (extractelement %bc, idx), 0
1624 DenseMap<Value *, SmallVector<SelectInst *, 8>> CondToSelects;
1625
1626 for (User *U : BC->users()) {
1627 auto *Ext = dyn_cast<ExtractElementInst>(U);
1628 if (!Ext)
1629 continue;
1630
1631 for (User *ExtUser : Ext->users()) {
1632 Value *Cond;
1633 // Match: select i1 %cond, %ext, 0
     // The i1 check rejects selects with a vector condition.
1634 if (match(ExtUser, m_Select(m_Value(Cond), m_Specific(Ext), m_Zero())) &&
1635 Cond->getType()->isIntegerTy(1))
1636 CondToSelects[Cond].push_back(cast<SelectInst>(ExtUser));
1637 }
1638 }
1639
1640 if (CondToSelects.empty())
1641 return false;
1642
1643 bool MadeChange = false;
1644 Value *SrcVec = BC->getOperand(0);
1645
1646 // Process each group of selects with the same condition.
     // NOTE(review): the by-value structured binding copies each SmallVector,
     // and the groups are emitted in DenseMap iteration order, which is
     // presumably not stable for pointer keys — confirm whether either matters
     // here.
1647 for (auto [Cond, Selects] : CondToSelects) {
1648 // Only profitable if vector select cost < total scalar select cost.
1649 if (Selects.size() < MinSelects) {
1650 LLVM_DEBUG(dbgs() << "VectorCombine: foldSelectsFromBitcast not "
1651 << "profitable (VecCost=" << VecSelCost
1652 << ", ScalarCost=" << ScalarSelCost
1653 << ", NumSelects=" << Selects.size() << ")\n");
1654 continue;
1655 }
1656
1657 // Create the vector select and bitcast once for this condition.
     // Default to inserting right after the bitcast; if the bitcast dominates
     // the instruction defining the condition, insert right after that
     // instruction instead.
1658 auto InsertPt = std::next(BC->getIterator());
1659
1660 if (auto *CondInst = dyn_cast<Instruction>(Cond))
1661 if (DT.dominates(BC, CondInst))
1662 InsertPt = std::next(CondInst->getIterator());
1663
1664 Builder.SetInsertPoint(InsertPt);
1665 Value *VecSel =
1666 Builder.CreateSelect(Cond, SrcVec, Constant::getNullValue(SrcVecTy));
1667 Value *NewBC = Builder.CreateBitCast(VecSel, DstVecTy);
1668
1669 // Replace each scalar select with an extract from the new bitcast.
1670 for (SelectInst *Sel : Selects) {
     // The true value is the extract matched when the group was collected.
1671 auto *Ext = cast<ExtractElementInst>(Sel->getTrueValue());
1672 Value *Idx = Ext->getIndexOperand();
1673
     // Emit the new extract at the position of the select it replaces.
1674 Builder.SetInsertPoint(Sel);
1675 Value *NewExt = Builder.CreateExtractElement(NewBC, Idx);
1676 replaceValue(*Sel, *NewExt);
1677 MadeChange = true;
1678 }
1679
1680 LLVM_DEBUG(dbgs() << "VectorCombine: folded " << Selects.size()
1681 << " selects into vector select\n");
1682 }
1683
1684 return MadeChange;
1685}
1686
// NOTE(review): the opening line(s) of this function's signature and a few
// argument-continuation lines further down are absent from this listing. The
// call sites in foldBinopOfReductions suggest the signature begins with
// `analyzeCostOfVecReduction(II, CostKind, ...` -- confirm against upstream.
//
// Estimates the cost of the reduction intrinsic II. CostAfterReduction is
// written on every path. CostBeforeReduction is written only when the reduced
// operand matches one of the extend patterns below and is left untouched
// otherwise.
1689 const TargetTransformInfo &TTI,
1690 InstructionCost &CostBeforeReduction,
1691 InstructionCost &CostAfterReduction) {
1692 Instruction *Op0, *Op1;
1693 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1694 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1695 unsigned ReductionOpc =
1696 getArithmeticReductionInstruction(II.getIntrinsicID());
 // Case 1: the reduced value is a zext/sext. The extend is costed as the
 // "before" part and the reduction is costed as an extended reduction.
1697 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1698 bool IsUnsigned = isa<ZExtInst>(RedOp);
1699 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1700
1701 CostBeforeReduction =
1702 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1704 CostAfterReduction =
1705 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1706 ExtType, FastMathFlags(), CostKind);
1707 return;
1708 }
 // Case 2: only for reduce.add. Both multiply operands must be extends with
 // the same opcode and the same source type.
 // NOTE(review): the first pattern of the match() below is not visible in this
 // listing.
1709 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1710 match(RedOp,
1712 match(Op0, m_ZExtOrSExt(m_Value())) &&
1713 Op0->getOpcode() == Op1->getOpcode() &&
1714 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1715 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1716 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1717 bool IsUnsigned = isa<ZExtInst>(Op0);
1718 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1719 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1720
1721 InstructionCost ExtCost =
1722 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1724 InstructionCost MulCost =
1725 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1726 InstructionCost Ext2Cost =
1727 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1729
 // Two inner extends, the multiply and the outer extend form the "before"
 // cost; the reduction itself is costed as a multiply-accumulate reduction.
1730 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1731 CostAfterReduction = TTI.getMulAccReductionCost(
1732 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1733 return;
1734 }
 // Fallback: a plain arithmetic reduction; CostBeforeReduction is not touched.
1735 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1736 std::nullopt, CostKind);
1737}
1738
1739bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1740 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1741 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1742 if (BinOpOpc == Instruction::Sub)
1743 ReductionIID = Intrinsic::vector_reduce_add;
1744 if (ReductionIID == Intrinsic::not_intrinsic)
1745 return false;
1746
1747 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1748 Intrinsic::ID IID) -> Value * {
1749 auto *II = dyn_cast<IntrinsicInst>(V);
1750 if (!II)
1751 return nullptr;
1752 if (II->getIntrinsicID() == IID && II->hasOneUse())
1753 return II->getArgOperand(0);
1754 return nullptr;
1755 };
1756
1757 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1758 if (!V0)
1759 return false;
1760 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1761 if (!V1)
1762 return false;
1763
1764 auto *VTy = cast<VectorType>(V0->getType());
1765 if (V1->getType() != VTy)
1766 return false;
1767 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1768 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1769 unsigned ReductionOpc =
1770 getArithmeticReductionInstruction(II0.getIntrinsicID());
1771
1772 InstructionCost OldCost = 0;
1773 InstructionCost NewCost = 0;
1774 InstructionCost CostOfRedOperand0 = 0;
1775 InstructionCost CostOfRed0 = 0;
1776 InstructionCost CostOfRedOperand1 = 0;
1777 InstructionCost CostOfRed1 = 0;
1778 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1779 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1780 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1781 NewCost =
1782 CostOfRedOperand0 + CostOfRedOperand1 +
1783 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1784 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1785 if (NewCost >= OldCost || !NewCost.isValid())
1786 return false;
1787
1788 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1789 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1790 << "\n");
1791 Value *VectorBO;
1792 if (BinOpOpc == Instruction::Or)
1793 VectorBO = Builder.CreateOr(V0, V1, "",
1794 cast<PossiblyDisjointInst>(I).isDisjoint());
1795 else
1796 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1797
1798 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1799 replaceValue(I, *Rdx);
1800 return true;
1801}
1802
1803// Check if memory loc modified between two instrs in the same BB
// NOTE(review): the opening line(s) of the signature are absent from this
// listing; the call in foldSingleElementStore suggests the function is
// isMemModifiedBetween(Begin, End, Loc, AA) -- confirm against upstream.
//
// Returns true if any instruction in [Begin, End) may modify Loc according to
// AA. Also returns true once more than MaxInstrsToScan instructions have been
// scanned, so hitting the scan limit is treated like a modification.
1806 const MemoryLocation &Loc, AAResults &AA) {
1807 unsigned NumScanned = 0;
1808 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1809 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1810 ++NumScanned > MaxInstrsToScan;
1811 });
1812}
1813
1814namespace {
1815/// Helper class to indicate whether a vector index can be safely scalarized and
1816/// if a freeze needs to be inserted.
1817class ScalarizationResult {
1818 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1819
1820 StatusTy Status;
1821 Value *ToFreeze;
1822
1823 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1824 : Status(Status), ToFreeze(ToFreeze) {}
1825
1826public:
1827 ScalarizationResult(const ScalarizationResult &Other) = default;
1828 ~ScalarizationResult() {
1829 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1830 }
1831
1832 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1833 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1834 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1835 return {StatusTy::SafeWithFreeze, ToFreeze};
1836 }
1837
1838 /// Returns true if the index can be scalarize without requiring a freeze.
1839 bool isSafe() const { return Status == StatusTy::Safe; }
1840 /// Returns true if the index cannot be scalarized.
1841 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1842 /// Returns true if the index can be scalarize, but requires inserting a
1843 /// freeze.
1844 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1845
1846 /// Reset the state of Unsafe and clear ToFreze if set.
1847 void discard() {
1848 ToFreeze = nullptr;
1849 Status = StatusTy::Unsafe;
1850 }
1851
1852 /// Freeze the ToFreeze and update the use in \p User to use it.
1853 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1854 assert(isSafeWithFreeze() &&
1855 "should only be used when freezing is required");
1856 assert(is_contained(ToFreeze->users(), &UserI) &&
1857 "UserI must be a user of ToFreeze");
1858 IRBuilder<>::InsertPointGuard Guard(Builder);
1859 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1860 Value *Frozen =
1861 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1862 for (Use &U : make_early_inc_range((UserI.operands())))
1863 if (U.get() == ToFreeze)
1864 U.set(Frozen);
1865
1866 ToFreeze = nullptr;
1867 }
1868};
1869} // namespace
1870
1871/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1872/// Idx. \p Idx must access a valid vector element.
1873static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1874 Instruction *CtxI,
1875 AssumptionCache &AC,
1876 const DominatorTree &DT) {
1877 // We do checks for both fixed vector types and scalable vector types.
1878 // This is the number of elements of fixed vector types,
1879 // or the minimum number of elements of scalable vector types.
1880 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1881 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1882
1883 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1884 if (C->getValue().ult(NumElements))
1885 return ScalarizationResult::safe();
1886 return ScalarizationResult::unsafe();
1887 }
1888
1889 // Always unsafe if the index type can't handle all inbound values.
1890 if (!llvm::isUIntN(IntWidth, NumElements))
1891 return ScalarizationResult::unsafe();
1892
1893 APInt Zero(IntWidth, 0);
1894 APInt MaxElts(IntWidth, NumElements);
1895 ConstantRange ValidIndices(Zero, MaxElts);
1896 ConstantRange IdxRange(IntWidth, true);
1897
1898 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1899 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1900 true, &AC, CtxI, &DT)))
1901 return ScalarizationResult::safe();
1902 return ScalarizationResult::unsafe();
1903 }
1904
1905 // If the index may be poison, check if we can insert a freeze before the
1906 // range of the index is restricted.
1907 Value *IdxBase;
1908 ConstantInt *CI;
1909 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1910 IdxRange = IdxRange.binaryAnd(CI->getValue());
1911 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1912 IdxRange = IdxRange.urem(CI->getValue());
1913 }
1914
1915 if (ValidIndices.contains(IdxRange))
1916 return ScalarizationResult::safeWithFreeze(IdxBase);
1917 return ScalarizationResult::unsafe();
1918}
1919
1920/// The memory operation on a vector of \p ScalarType had alignment of
1921/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1922/// alignment that will be valid for the memory operation on a single scalar
1923/// element of the same type with index \p Idx.
// NOTE(review): the opening line of the signature is absent from this listing;
// call sites pass (alignment, scalar type, index, data layout) to
// computeAlignmentAfterScalarization -- confirm against upstream.
1925 Type *ScalarType, Value *Idx,
1926 const DataLayout &DL) {
 // Constant index: the byte offset is the index times the element store size,
 // so the result is the alignment common to the vector and that offset.
1927 if (auto *C = dyn_cast<ConstantInt>(Idx))
1928 return commonAlignment(VectorAlignment,
1929 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
 // Variable index: use the element store size itself as the offset.
1930 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1931}
1932
1933// Combine patterns like:
1934// %0 = load <4 x i32>, <4 x i32>* %a
1935// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1936// store <4 x i32> %1, <4 x i32>* %a
1937// to:
1938// %0 = bitcast <4 x i32>* %a to i32*
1939// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1940// store i32 %b, i32* %1
//
// Returns true if the vector store I was replaced by a store of the single
// inserted element.
1941bool VectorCombine::foldSingleElementStore(Instruction &I) {
 // NOTE(review): the condition guarding the next early return is not visible
 // in this listing.
1943 return false;
1944 auto *SI = cast<StoreInst>(&I);
1945 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1946 return false;
1947
1948 // TODO: Combine more complicated patterns (multiple insert) by referencing
1949 // TargetTransformInfo.
 // NOTE(review): `Source` is used below but its declaration is not visible in
 // this listing.
1951 Value *NewElement;
1952 Value *Idx;
 // The stored value must be an insertelement whose vector operand is an
 // instruction.
1953 if (!match(SI->getValueOperand(),
1954 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1955 m_Value(Idx))))
1956 return false;
1957
1958 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1959 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1960 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1961 // Don't optimize for atomic/volatile load or store. Ensure memory is not
1962 // modified between, vector type matches store size, and index is inbounds.
 // The load must also be in the same block as the store and read from the
 // address the store writes to (compared after stripping pointer casts).
1963 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1964 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1965 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1966 return false;
1967
1968 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1969 if (ScalarizableIdx.isUnsafe() ||
1970 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1971 MemoryLocation::get(SI), AA))
1972 return false;
1973
1974 // Ensure we add the load back to the worklist BEFORE its users so they can
1975 // be erased in the correct order.
1976 Worklist.push(Load);
1977
 // The index is in bounds only if the value it is derived from is frozen;
 // freeze() rewrites the operand of the instruction that defines Idx.
1978 if (ScalarizableIdx.isSafeWithFreeze())
1979 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
 // Address the selected element inside the vector, then store just that
 // element, carrying over the metadata of the original store.
1980 Value *GEP = Builder.CreateInBoundsGEP(
1981 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1982 {ConstantInt::get(Idx->getType(), 0), Idx});
1983 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1984 NSI->copyMetadata(*SI);
 // Start from the larger of the load and store alignments, since both access
 // the same address, and reduce it for the element offset.
1985 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1986 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1987 *DL);
1988 NSI->setAlignment(ScalarOpAlignment);
1989 replaceValue(I, *NSI);
 // NOTE(review): one line between the replaceValue call and this return is
 // absent from this listing.
1991 return true;
1992 }
1993
1994 return false;
1995}
1996
1997/// Try to scalarize vector loads feeding extractelement or bitcast
1998/// instructions.
1999bool VectorCombine::scalarizeLoad(Instruction &I) {
2000 Value *Ptr;
2001 if (!match(&I, m_Load(m_Value(Ptr))))
2002 return false;
2003
2004 auto *LI = cast<LoadInst>(&I);
2005 auto *VecTy = cast<VectorType>(LI->getType());
2006 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
2007 return false;
2008
2009 bool AllExtracts = true;
2010 bool AllBitcasts = true;
2011 Instruction *LastCheckedInst = LI;
2012 unsigned NumInstChecked = 0;
2013
2014 // Check what type of users we have (must either all be extracts or
2015 // bitcasts) and ensure no memory modifications between the load and
2016 // its users.
2017 for (User *U : LI->users()) {
2018 auto *UI = dyn_cast<Instruction>(U);
2019 if (!UI || UI->getParent() != LI->getParent())
2020 return false;
2021
2022 // If any user is waiting to be erased, then bail out as this will
2023 // distort the cost calculation and possibly lead to infinite loops.
2024 if (UI->use_empty())
2025 return false;
2026
2027 if (!isa<ExtractElementInst>(UI))
2028 AllExtracts = false;
2029 if (!isa<BitCastInst>(UI))
2030 AllBitcasts = false;
2031
2032 // Check if any instruction between the load and the user may modify memory.
2033 if (LastCheckedInst->comesBefore(UI)) {
2034 for (Instruction &I :
2035 make_range(std::next(LI->getIterator()), UI->getIterator())) {
2036 // Bail out if we reached the check limit or the instruction may write
2037 // to memory.
2038 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
2039 return false;
2040 NumInstChecked++;
2041 }
2042 LastCheckedInst = UI;
2043 }
2044 }
2045
2046 if (AllExtracts)
2047 return scalarizeLoadExtract(LI, VecTy, Ptr);
2048 if (AllBitcasts)
2049 return scalarizeLoadBitcast(LI, VecTy, Ptr);
2050 return false;
2051}
2052
2053/// Try to scalarize vector loads feeding extractelement instructions.
///
/// Each extract of the vector load \p LI is replaced by a scalar load from the
/// address of the extracted element, provided every index can be scalarized
/// and the summed scalar cost is strictly lower than the vector load plus
/// extracts. All users of \p LI are assumed to be extractelement instructions
/// (they are accessed with cast<>).
2054bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
2055 Value *Ptr) {
 // NOTE(review): the condition guarding the next early return is not visible
 // in this listing.
2057 return false;
2058
 // Results that still hold a value to freeze are kept per extract. The guard
 // discards them on every early exit so their destructor assertion holds.
2059 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
2060 llvm::scope_exit FailureGuard([&]() {
2061 // If the transform is aborted, discard the ScalarizationResults.
2062 for (auto &Pair : NeedFreeze)
2063 Pair.second.discard();
2064 });
2065
 // NOTE(review): the trailing arguments of the getMemoryOpCost calls in this
 // function are not visible in this listing.
2066 InstructionCost OriginalCost =
2067 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2069 InstructionCost ScalarizedCost = 0;
2070
2071 for (User *U : LI->users()) {
2072 auto *UI = cast<ExtractElementInst>(U);
2073
2074 auto ScalarIdx =
2075 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
2076 if (ScalarIdx.isUnsafe())
2077 return false;
 // Store a copy in the map, then clear the local so that only the map entry
 // still holds the value to freeze.
2078 if (ScalarIdx.isSafeWithFreeze()) {
2079 NeedFreeze.try_emplace(UI, ScalarIdx);
2080 ScalarIdx.discard();
2081 }
2082
 // Original form: one extract per user (index -1 when it is not a constant).
 // Scalar form: one element load plus one address computation per user.
2083 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
2084 OriginalCost +=
2085 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
2086 Index ? Index->getZExtValue() : -1);
2087 ScalarizedCost +=
2088 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
2090 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
2091 nullptr, nullptr, CostKind);
2092 }
2093
2094 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
2095 << "\n LoadExtractCost: " << OriginalCost
2096 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2097
 // Equal cost is not enough; the scalar form must be strictly cheaper.
2098 if (ScalarizedCost >= OriginalCost)
2099 return false;
2100
2101 // Ensure we add the load back to the worklist BEFORE its users so they can
2102 // be erased in the correct order.
2103 Worklist.push(LI);
2104
2105 Type *ElemType = VecTy->getElementType();
2106
2107 // Replace extracts with narrow scalar loads.
2108 for (User *U : LI->users()) {
2109 auto *EI = cast<ExtractElementInst>(U);
2110 Value *Idx = EI->getIndexOperand();
2111
2112 // Insert 'freeze' for poison indexes.
2113 auto It = NeedFreeze.find(EI);
2114 if (It != NeedFreeze.end())
2115 It->second.freeze(Builder, *cast<Instruction>(Idx));
2116
2117 Builder.SetInsertPoint(EI);
2118 Value *GEP =
2119 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
2120 auto *NewLoad = cast<LoadInst>(
2121 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
2122
2123 Align ScalarOpAlignment =
2124 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
2125 NewLoad->setAlignment(ScalarOpAlignment);
2126
 // AA metadata is carried over only for constant indices, where the byte
 // offset of the element is known.
2127 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
2128 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
2129 AAMDNodes OldAAMD = LI->getAAMetadata();
2130 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
2131 }
2132
2133 replaceValue(*EI, *NewLoad, false);
2134 }
2135
 // Success: every pending freeze was applied, so the guard must not run.
2136 FailureGuard.release();
2137 return true;
2138}
2139
2140/// Try to scalarize vector loads feeding bitcast instructions.
///
/// When every user of \p LI bitcasts the whole vector to the same integer or
/// floating-point type of the same bit width, the vector load is replaced by
/// one scalar load of that type, provided the scalar load is strictly cheaper
/// than the vector load plus bitcasts. All users of \p LI are assumed to be
/// bitcast instructions (they are accessed with cast<>).
2141bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
2142 Value *Ptr) {
 // NOTE(review): the trailing arguments of the cost queries in this function
 // are not visible in this listing.
2143 InstructionCost OriginalCost =
2144 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2146
2147 Type *TargetScalarType = nullptr;
2148 unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
2149
2150 for (User *U : LI->users()) {
2151 auto *BC = cast<BitCastInst>(U);
2152
 // Only integer and floating-point destination types are accepted.
2153 Type *DestTy = BC->getDestTy();
2154 if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
2155 return false;
2156
 // The destination must be exactly as wide as the whole vector.
2157 unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
2158 if (DestBitWidth != VecBitWidth)
2159 return false;
2160
2161 // All bitcasts must target the same scalar type.
2162 if (!TargetScalarType)
2163 TargetScalarType = DestTy;
2164 else if (TargetScalarType != DestTy)
2165 return false;
2166
2167 OriginalCost +=
2168 TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
2170 }
2171
 // Still null means the load has no users; there is nothing to rewrite.
2172 if (!TargetScalarType)
2173 return false;
2174
2175 assert(!LI->user_empty() && "Unexpected load without bitcast users");
2176 InstructionCost ScalarizedCost =
2177 TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
2179
2180 LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2181 << "\n OriginalCost: " << OriginalCost
2182 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2183
 // Equal cost is not enough; the scalar load must be strictly cheaper.
2184 if (ScalarizedCost >= OriginalCost)
2185 return false;
2186
2187 // Ensure we add the load back to the worklist BEFORE its users so they can
2188 // be erased in the correct order.
2189 Worklist.push(LI);
2190
 // Emit the scalar load at the position of the vector load, keeping its
 // alignment and metadata.
2191 Builder.SetInsertPoint(LI);
2192 auto *ScalarLoad =
2193 Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
2194 ScalarLoad->setAlignment(LI->getAlign());
2195 ScalarLoad->copyMetadata(*LI);
2196
2197 // Replace all bitcast users with the scalar load.
2198 for (User *U : LI->users()) {
2199 auto *BC = cast<BitCastInst>(U);
2200 replaceValue(*BC, *ScalarLoad, false);
2201 }
2202
2203 return true;
2204}
2205
/// Try to replace a fixed-vector zext whose users are all constant-index
/// extractelement instructions by scalar shift-and-mask operations on the
/// source vector bitcast to one integer. This requires the whole source vector
/// to be as wide as one destination element.
///
/// \returns true if the extracts were rewritten.
2206bool VectorCombine::scalarizeExtExtract(Instruction &I) {
 // NOTE(review): the condition guarding the next early return is not visible
 // in this listing.
2208 return false;
2209 auto *Ext = dyn_cast<ZExtInst>(&I);
2210 if (!Ext)
2211 return false;
2212
2213 // Try to convert a vector zext feeding only extracts to a set of scalar
2214 // ((bitcast Src) >> ShiftAmt) & LowBitsMask(Size) operations
2215 // if profitable.
2216 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2217 if (!SrcTy)
2218 return false;
2219 auto *DstTy = cast<FixedVectorType>(Ext->getType());
2220
2221 Type *ScalarDstTy = DstTy->getElementType();
2222 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2223 return false;
2224
 // NOTE(review): the trailing arguments of this cost query are not visible in
 // this listing.
2225 InstructionCost VectorCost =
2226 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2228 unsigned ExtCnt = 0;
2229 bool ExtLane0 = false;
 // Every user must be a constant-index extract. Extracts without uses are
 // accepted but left out of the cost.
2230 for (User *U : Ext->users()) {
2231 uint64_t Idx;
2232 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2233 return false;
2234 if (cast<Instruction>(U)->use_empty())
2235 continue;
2236 ExtCnt += 1;
2237 ExtLane0 |= !Idx;
2238 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2239 CostKind, Idx, U);
2240 }
2241
 // Scalar form: one `and` per live extract and one `lshr` per live extract,
 // with one shift fewer when lane 0 is among them.
 // NOTE(review): parts of this expression are not visible in this listing.
2242 InstructionCost ScalarCost =
2243 ExtCnt * TTI.getArithmeticInstrCost(
2244 Instruction::And, ScalarDstTy, CostKind,
2247 (ExtCnt - ExtLane0) *
2249 Instruction::LShr, ScalarDstTy, CostKind,
 // On equal cost the scalar form is still chosen.
2252 if (ScalarCost > VectorCost)
2253 return false;
2254
2255 Value *ScalarV = Ext->getOperand(0);
2256 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2257 &DT)) {
2258 // Check whether all lanes are extracted, all extracts trigger UB
2259 // on poison, and the last extract (and hence all previous ones)
2260 // are guaranteed to execute if Ext executes. If so, we do not
2261 // need to insert a freeze.
2262 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2263 bool AllExtractsTriggerUB = true;
2264 ExtractElementInst *LastExtract = nullptr;
2265 BasicBlock *ExtBB = Ext->getParent();
2266 for (User *U : Ext->users()) {
2267 auto *Extract = cast<ExtractElementInst>(U);
2268 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2269 AllExtractsTriggerUB = false;
2270 break;
2271 }
2272 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2273 if (!LastExtract || LastExtract->comesBefore(Extract))
2274 LastExtract = Extract;
2275 }
 // NOTE(review): the first line of the last condition below is not visible
 // in this listing.
2276 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2277 !AllExtractsTriggerUB ||
2279 LastExtract->getIterator()))
2280 ScalarV = Builder.CreateFreeze(ScalarV);
2281 }
 // View the whole source vector as one integer of the same total width.
2282 ScalarV = Builder.CreateBitCast(
2283 ScalarV,
2284 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2285 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2286 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
 // Mask that keeps the low SrcEltSizeInBits bits of the packed integer.
2287 APInt EltBitMask = APInt::getLowBitsSet(TotalBits, SrcEltSizeInBits);
2288 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2289 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2290 for (User *U : Ext->users()) {
2291 auto *Extract = cast<ExtractElementInst>(U);
2292 uint64_t Idx =
2293 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
 // The shift amount is counted from the other end of the packed integer on
 // big-endian targets.
2294 uint64_t ShiftAmt =
2295 DL->isBigEndian()
2296 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2297 : (Idx * SrcEltSizeInBits);
2298 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2299 Value *And = Builder.CreateAnd(LShr, Mask);
2300 U->replaceAllUsesWith(And);
2301 }
2302 return true;
2303}
2304
2305/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2306/// to "(bitcast (concat X, Y))"
2307/// where X/Y are bitcasted from i1 mask vectors.
///
/// Only integer-typed \p I on little-endian targets is handled.
/// \returns true if \p I was replaced.
2308bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2309 Type *Ty = I.getType();
2310 if (!Ty->isIntegerTy())
2311 return false;
2312
2313 // TODO: Add big endian test coverage
2314 if (DL->isBigEndian())
2315 return false;
2316
2317 // Restrict to disjoint cases so the mask vectors aren't overlapping.
 // NOTE(review): the match that binds X and Y is not visible in this listing.
2318 Instruction *X, *Y;
2320 return false;
2321
2322 // Allow both sources to contain shl, to handle more generic pattern:
2323 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
 // NOTE(review): the shl pattern lines of the next two matches are not
 // visible in this listing.
2324 Value *SrcX;
2325 uint64_t ShAmtX = 0;
2326 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2327 !match(X, m_OneUse(
2329 m_ConstantInt(ShAmtX)))))
2330 return false;
2331
2332 Value *SrcY;
2333 uint64_t ShAmtY = 0;
2334 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2335 !match(Y, m_OneUse(
2337 m_ConstantInt(ShAmtY)))))
2338 return false;
2339
2340 // Canonicalize larger shift to the RHS.
2341 if (ShAmtX > ShAmtY) {
2342 std::swap(X, Y);
2343 std::swap(SrcX, SrcY);
2344 std::swap(ShAmtX, ShAmtY);
2345 }
2346
2347 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2348 // difference is the mask width so they can be easily concatenated together.
 // The mask may also be at most half as wide as the result type.
2349 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2350 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2351 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2352 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2353 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2354 !MaskTy->getElementType()->isIntegerTy(1) ||
2355 MaskTy->getNumElements() != ShAmtDiff ||
2356 MaskTy->getNumElements() > (BitWidth / 2))
2357 return false;
2358
 // Types for the concatenated mask: a vector with twice the elements, the
 // integer of that many bits, and the integer matching one input mask.
2359 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2360 auto *ConcatIntTy =
2361 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2362 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2363
 // Mask 0, 1, ..., N-1 over both sources: SrcX elements followed by SrcY.
2364 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2365 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2366
2367 // TODO: Is it worth supporting multi use cases?
 // Old form: the `or`, the shifts present, two zexts and two bitcasts.
 // NOTE(review): several argument-continuation lines of the cost queries
 // below, including the start of the shuffle cost, are not visible in this
 // listing.
2368 InstructionCost OldCost = 0;
2369 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2370 OldCost +=
2371 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2372 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2374 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2376
 // New form: one bitcast of the concatenated mask, a zext only if the result
 // is wider, and a shl only if the smaller shift amount is non-zero.
2377 InstructionCost NewCost = 0;
2379 MaskTy, ConcatMask, CostKind);
2380 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2382 if (Ty != ConcatIntTy)
2383 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2385 if (ShAmtX > 0)
2386 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2387
2388 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2389 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2390 << "\n");
2391
 // On equal cost the fold is still performed.
2392 if (NewCost > OldCost)
2393 return false;
2394
2395 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2396 // any residual zero-extension or shifting.
 // Each intermediate value is pushed to the worklist before it is wrapped by
 // the next instruction.
2397 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2398 Worklist.pushValue(Concat);
2399
2400 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2401
2402 if (Ty != ConcatIntTy) {
2403 Worklist.pushValue(Result);
2404 Result = Builder.CreateZExt(Result, Ty);
2405 }
2406
2407 if (ShAmtX > 0) {
2408 Worklist.pushValue(Result);
2409 Result = Builder.CreateShl(Result, ShAmtX);
2410 }
2411
2412 replaceValue(I, *Result);
2413 return true;
2414}
2415
2416/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2417/// --> "binop (shuffle), (shuffle)".
///
/// At least one binop operand must itself be a shuffle. An operand that is
/// not a shuffle is treated as an identity shuffle of itself.
/// \returns true if \p I was replaced.
2418bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2419 BinaryOperator *BinOp;
2420 ArrayRef<int> OuterMask;
2421 if (!match(&I, m_Shuffle(m_BinOp(BinOp), m_Undef(), m_Mask(OuterMask))))
2422 return false;
2423
2424 // Don't introduce poison into div/rem.
2425 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2426 return false;
2427
2428 Value *Op00, *Op01, *Op10, *Op11;
2429 ArrayRef<int> Mask0, Mask1;
2430 bool Match0 = match(BinOp->getOperand(0),
2431 m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)));
2432 bool Match1 = match(BinOp->getOperand(1),
2433 m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)));
2434 if (!Match0 && !Match1)
2435 return false;
2436
 // For an operand that is not a shuffle, use the operand for both sources.
2437 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2438 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2439 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2440 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2441
2442 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2443 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2444 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2445 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2446 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2447 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2448 return false;
2449
2450 unsigned NumSrcElts = BinOpTy->getNumElements();
2451
2452 // Don't accept shuffles that reference the second operand when the binop is
2453 // div/rem or when the second shuffle operand is not poison.
2454 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2455 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2456 return false;
2457
2458 // Merge outer / inner (or identity if no match) shuffles.
 // Outer mask elements that are negative or select from the second operand
 // become poison in both new masks.
2459 SmallVector<int> NewMask0, NewMask1;
2460 for (int M : OuterMask) {
2461 if (M < 0 || M >= (int)NumSrcElts) {
2462 NewMask0.push_back(PoisonMaskElem);
2463 NewMask1.push_back(PoisonMaskElem);
2464 } else {
2465 NewMask0.push_back(Match0 ? Mask0[M] : M);
2466 NewMask1.push_back(Match1 ? Mask1[M] : M);
2467 }
2468 }
2469
 // A merged mask that is an identity over its first source needs no shuffle.
 // NOTE(review): IsIdentity1 is checked against the element count of Op0Ty
 // (NumOpElts), not Op1Ty -- confirm this is intended.
2470 unsigned NumOpElts = Op0Ty->getNumElements();
2471 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2472 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2473 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2474 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2475 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2476 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2477
2478 InstructionCost NewCost = 0;
2479 // Try to merge shuffles across the binop if the new shuffles are not costly.
 // NOTE(review): the first line of several shuffle-cost queries below, and
 // with it the declarations of Shuf0Cost and Shuf1Cost, are not visible in
 // this listing.
2480 InstructionCost BinOpCost =
2481 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind);
2482 InstructionCost OldCost =
2484 ShuffleDstTy, BinOpTy, OuterMask, CostKind,
2485 0, nullptr, {BinOp}, &I);
 // Instructions with other users stay after the fold, so their cost is added
 // to the new cost as well.
2486 if (!BinOp->hasOneUse())
2487 NewCost += BinOpCost;
2488
2489 if (Match0) {
2491 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2492 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2493 OldCost += Shuf0Cost;
2494 if (!BinOp->hasOneUse() || !BinOp->getOperand(0)->hasOneUse())
2495 NewCost += Shuf0Cost;
2496 }
2497 if (Match1) {
2499 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2500 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2501 OldCost += Shuf1Cost;
2502 if (!BinOp->hasOneUse() || !BinOp->getOperand(1)->hasOneUse())
2503 NewCost += Shuf1Cost;
2504 }
2505
 // The new binop works on the shuffle result type. A new shuffle is costed
 // only where the merged mask is not an identity.
2506 NewCost += TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2507
2508 if (!IsIdentity0)
2509 NewCost +=
2511 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2512 if (!IsIdentity1)
2513 NewCost +=
2515 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2516
2517 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2518 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2519 << "\n");
2520
2521 // If costs are equal, still fold as we reduce instruction count.
2522 if (NewCost > OldCost)
2523 return false;
2524
2525 Value *LHS =
2526 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2527 Value *RHS =
2528 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2529 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2530
2531 // Copy the IR flags from the old binop.
2532 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2533 NewInst->copyIRFlags(BinOp);
2534
2535 Worklist.pushValue(LHS);
2536 Worklist.pushValue(RHS);
2537 replaceValue(I, *NewBO);
2538 return true;
2539}
2540
2541/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2542/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
///
/// Both shuffle operands must have the same opcode (and, for compares, the
/// same predicate). The rewrite only happens when the cost comparison at the
/// end of the function favours it.
///
/// \param I The candidate outer shuffle.
/// \returns true if \p I was replaced by the new binop/cmp.
2543bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2544 ArrayRef<int> OldMask;
2545 Instruction *LHS, *RHS;
2547 m_Mask(OldMask))))
2548 return false;
2549
2550 // TODO: Add support for addlike etc.
2551 if (LHS->getOpcode() != RHS->getOpcode())
2552 return false;
2553
// X/Y are the operands of LHS, Z/W the operands of RHS.
2554 Value *X, *Y, *Z, *W;
2555 bool IsCommutative = false;
// BAD_ICMP_PREDICATE doubles as the "this is a binop, not a compare" marker
// for the rest of the function.
2556 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2557 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2558 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2559 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2560 auto *BO = cast<BinaryOperator>(LHS);
2561 // Don't introduce poison into div/rem.
2562 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2563 return false;
2564 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2565 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2566 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2567 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2568 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2569 } else
2570 return false;
2571
// Only fixed-width vectors are handled, and both binops must work on the same
// operand type.
2572 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2573 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2574 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2575 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2576 return false;
2577
2578 bool SameBinOp = LHS == RHS;
2579 unsigned NumSrcElts = BinOpTy->getNumElements();
2580
2581 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2582 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2583 std::swap(X, Y);
2584
// Folds a mask index that selects from the second shuffle input back onto
// the first input.
2585 auto ConvertToUnary = [NumSrcElts](int &M) {
2586 if (M >= (int)NumSrcElts)
2587 M -= NumSrcElts;
2588 };
2589
// Each new shuffle starts from the outer mask. When both binops share the
// operand, the mask is made single-source and the second input becomes poison.
2590 SmallVector<int> NewMask0(OldMask);
2592 TTI::OperandValueInfo Op0Info = TTI.commonOperandInfo(X, Z);
2593 if (X == Z) {
2594 llvm::for_each(NewMask0, ConvertToUnary);
2596 Z = PoisonValue::get(BinOpTy);
2597 }
2598
2599 SmallVector<int> NewMask1(OldMask);
2601 TTI::OperandValueInfo Op1Info = TTI.commonOperandInfo(Y, W);
2602 if (Y == W) {
2603 llvm::for_each(NewMask1, ConvertToUnary);
2605 W = PoisonValue::get(BinOpTy);
2606 }
2607
2608 // Try to replace a binop with a shuffle if the shuffle is not costly.
2609 // When SameBinOp, only count the binop cost once.
2612
2613 InstructionCost OldCost = LHSCost;
2614 if (!SameBinOp) {
2615 OldCost += RHSCost;
2616 }
2618 ShuffleDstTy, BinResTy, OldMask, CostKind, 0,
2619 nullptr, {LHS, RHS}, &I);
2620
2621 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2622 // where one use shuffles have gotten split across the binop/cmp. These
2623 // often allow a major reduction in total cost that wouldn't happen as
2624 // individual folds.
2625 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2626 TTI::TargetCostKind CostKind) -> bool {
2627 Value *InnerOp;
2628 ArrayRef<int> InnerMask;
2629 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2630 m_Mask(InnerMask)))) &&
2631 InnerOp->getType() == Op->getType() &&
2632 all_of(InnerMask,
2633 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
// Route every outer-mask entry that selects from this operand (the
// [Offset, Offset + NumSrcElts) window) through the inner mask. Negative
// inner entries are kept as-is instead of being rebased by Offset.
2634 for (int &M : Mask)
2635 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2636 M = InnerMask[M - Offset];
2637 M = 0 <= M ? M + Offset : M;
2638 }
2640 Op = InnerOp;
2641 return true;
2642 }
2643 return false;
2644 };
2645 bool ReducedInstCount = false;
2646 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2647 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2648 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2649 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
// If both sides end up as the same shuffle, one shuffle can feed both operands
// of the new instruction.
2650 bool SingleSrcBinOp = (X == Y) && (Z == W) && (NewMask0 == NewMask1);
2651 // SingleSrcBinOp only reduces instruction count if we also eliminate the
2652 // original binop(s). If binops have multiple uses, they won't be eliminated.
2653 ReducedInstCount |= SingleSrcBinOp && LHS->hasOneUser() && RHS->hasOneUser();
2654
2655 auto *ShuffleCmpTy =
2656 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2658 SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z});
2659 if (!SingleSrcBinOp)
2660 NewCost += TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1,
2661 CostKind, 0, nullptr, {Y, W});
2662
2663 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2664 NewCost += TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy,
2665 CostKind, Op0Info, Op1Info);
2666 } else {
2667 NewCost +=
2668 TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy, ShuffleDstTy,
2669 PredLHS, CostKind, Op0Info, Op1Info);
2670 }
2671 // If LHS/RHS have other uses, we need to account for the cost of keeping
2672 // the original instructions. When SameBinOp, only add the cost once.
2673 if (!LHS->hasOneUser())
2674 NewCost += LHSCost;
2675 if (!SameBinOp && !RHS->hasOneUser())
2676 NewCost += RHSCost;
2677
2678 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2679 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2680 << "\n");
2681
2682 // If either shuffle will constant fold away, then fold for the same cost as
2683 // we will reduce the instruction count.
2684 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2685 (isa<Constant>(Y) && isa<Constant>(W));
// A tie is accepted only when the instruction count goes down; otherwise the
// new sequence must be strictly cheaper.
2686 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2687 return false;
2688
2689 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2690 Value *Shuf1 =
2691 SingleSrcBinOp ? Shuf0 : Builder.CreateShuffleVector(Y, W, NewMask1);
2692 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2693 ? Builder.CreateBinOp(
2694 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2695 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2696
2697 // Intersect flags from the old binops.
2698 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2699 NewInst->copyIRFlags(LHS);
2700 NewInst->andIRFlags(RHS);
2701 }
2702
// Queue the new shuffles so later folds get a chance to look at them.
2703 Worklist.pushValue(Shuf0);
2704 Worklist.pushValue(Shuf1);
2705 replaceValue(I, *NewBO);
2706 return true;
2707}
2708
2709/// Try to convert,
2710/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2711/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
///
/// Requires both select conditions to be the same fixed vector type and, for
/// floating-point selects, identical fast-math flags on both selects.
///
/// \param I The candidate outer shuffle.
/// \returns true if \p I was replaced by the new select.
2712bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2713 ArrayRef<int> Mask;
2714 Value *C1, *T1, *F1, *C2, *T2, *F2;
2715 if (!match(&I, m_Shuffle(m_Select(m_Value(C1), m_Value(T1), m_Value(F1)),
2716 m_Select(m_Value(C2), m_Value(T2), m_Value(F2)),
2717 m_Mask(Mask))))
2718 return false;
2719
2720 auto *Sel1 = cast<Instruction>(I.getOperand(0));
2721 auto *Sel2 = cast<Instruction>(I.getOperand(1));
2722
// Scalar conditions fail the dyn_cast and are rejected here.
2723 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2724 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2725 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2726 return false;
2727
2728 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2729 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2730 // SelectInsts must have the same FMF.
2731 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2732 ((SI0FOp != nullptr) &&
2733 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2734 return false;
2735
2736 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2737 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2739 auto SelOp = Instruction::Select;
2740
2742 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2744 SelOp, SrcVecTy, C2VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2745
// Old sequence: the two selects plus the outer shuffle.
2746 InstructionCost OldCost =
2747 CostSel1 + CostSel2 +
2748 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2749 {I.getOperand(0), I.getOperand(1)}, &I);
2750
// New sequence: one shuffle each for the conditions, the true values and the
// false values, plus one select on the shuffled result type.
2752 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2753 Mask, CostKind, 0, nullptr, {C1, C2});
2754 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2755 nullptr, {T1, T2});
2756 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2757 nullptr, {F1, F2});
2758 auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2759 toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2760 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2762
// A select with other users stays in the IR, so its cost is still paid after
// the rewrite.
2763 if (!Sel1->hasOneUse())
2764 NewCost += CostSel1;
2765 if (!Sel2->hasOneUse())
2766 NewCost += CostSel2;
2767
2768 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2769 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2770 << "\n");
// Equal cost is accepted.
2771 if (NewCost > OldCost)
2772 return false;
2773
2774 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2775 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2776 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2777 Value *NewSel;
2778 // We presuppose that the SelectInsts have the same FMF.
2779 if (SI0FOp)
2780 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2781 SI0FOp->getFastMathFlags());
2782 else
2783 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2784
// Queue the new shuffles so later folds get a chance to look at them.
2785 Worklist.pushValue(ShuffleCmp);
2786 Worklist.pushValue(ShuffleTrue);
2787 Worklist.pushValue(ShuffleFalse);
2788 replaceValue(I, *NewSel);
2789 return true;
2790}
2791
2792/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2793/// into "castop (shuffle)".
///
/// Handles both the two-input form (both operands are casts from the same
/// source type) and the single-input form (second operand is undef/poison).
/// For bitcasts that change the element count the mask is rescaled to index
/// the cast's source elements.
///
/// \param I The candidate outer shuffle.
/// \returns true if \p I was replaced by the new cast.
2794bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2795 Value *V0, *V1;
2796 ArrayRef<int> OldMask;
2797 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2798 return false;
2799
2800 // Check whether this is a binary shuffle.
2801 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2802
// C1 is only required (and only dereferenced below) for binary shuffles.
2803 auto *C0 = dyn_cast<CastInst>(V0);
2804 auto *C1 = dyn_cast<CastInst>(V1);
2805 if (!C0 || (IsBinaryShuffle && !C1))
2806 return false;
2807
2808 Instruction::CastOps Opcode = C0->getOpcode();
2809
2810 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2811 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2812 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2813 return false;
2814
2815 if (IsBinaryShuffle) {
2816 if (C0->getSrcTy() != C1->getSrcTy())
2817 return false;
2818 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2819 if (Opcode != C1->getOpcode()) {
2820 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2821 Opcode = Instruction::SExt;
2822 else
2823 return false;
2824 }
2825 }
2826
2827 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2828 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2829 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2830 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2831 return false;
2832
2833 unsigned NumSrcElts = CastSrcTy->getNumElements();
2834 unsigned NumDstElts = CastDstTy->getNumElements();
2835 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2836 "Only bitcasts expected to alter src/dst element counts");
2837
2838 // Check for bitcasting of unscalable vector types.
2839 // e.g. <32 x i40> -> <40 x i32>
2840 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2841 (NumDstElts % NumSrcElts) != 0)
2842 return false;
2843
// NewMask is the outer mask re-expressed in units of the cast's source
// elements.
2844 SmallVector<int, 16> NewMask;
2845 if (NumSrcElts >= NumDstElts) {
2846 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2847 // always be expanded to the equivalent form choosing narrower elements.
2848 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2849 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2850 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2851 } else {
2852 // The bitcast is from narrow elements to wide elements. The shuffle mask
2853 // must choose consecutive elements to allow casting first.
2854 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2855 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2856 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2857 return false;
2858 }
2859
2860 auto *NewShuffleDstTy =
2861 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2862
2863 // Try to replace a castop with a shuffle if the shuffle is not costly.
2864 InstructionCost CostC0 =
2865 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2867
2869 if (IsBinaryShuffle)
2871 else
2873
2874 InstructionCost OldCost = CostC0;
2875 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2876 CostKind, 0, nullptr, {}, &I);
2877
2878 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2879 CastSrcTy, NewMask, CostKind);
2880 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
// A cast with other users stays in the IR, so its cost is still paid after
// the rewrite.
2882 if (!C0->hasOneUse())
2883 NewCost += CostC0;
2884 if (IsBinaryShuffle) {
2885 InstructionCost CostC1 =
2886 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2888 OldCost += CostC1;
2889 if (!C1->hasOneUse())
2890 NewCost += CostC1;
2891 }
2892
2893 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2894 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2895 << "\n");
// Equal cost is accepted.
2896 if (NewCost > OldCost)
2897 return false;
2898
2899 Value *Shuf;
2900 if (IsBinaryShuffle)
2901 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2902 NewMask);
2903 else
2904 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2905
2906 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2907
2908 // Intersect flags from the old casts.
2909 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2910 NewInst->copyIRFlags(C0);
2911 if (IsBinaryShuffle)
2912 NewInst->andIRFlags(C1);
2913 }
2914
2915 Worklist.pushValue(Shuf);
2916 replaceValue(I, *Cast);
2917 return true;
2918}
2919
2920/// Try to convert any of:
2921/// "shuffle (shuffle x, y), (shuffle y, x)"
2922/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2923/// "shuffle (shuffle x, undef), y"
2924/// "shuffle x, (shuffle y, undef)"
2925/// into "shuffle x, y".
2926bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2927 ArrayRef<int> OuterMask;
2928 Value *OuterV0, *OuterV1;
2929 if (!match(&I,
2930 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2931 return false;
2932
2933 ArrayRef<int> InnerMask0, InnerMask1;
2934 Value *X0, *X1, *Y0, *Y1;
2935 bool Match0 =
2936 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2937 bool Match1 =
2938 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2939 if (!Match0 && !Match1)
2940 return false;
2941
2942 // If the outer shuffle is a permute, then create a fake inner all-poison
2943 // shuffle. This is easier than accounting for length-changing shuffles below.
2944 SmallVector<int, 16> PoisonMask1;
2945 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2946 X1 = X0;
2947 Y1 = Y0;
2948 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2949 InnerMask1 = PoisonMask1;
2950 Match1 = true; // fake match
2951 }
2952
2953 X0 = Match0 ? X0 : OuterV0;
2954 Y0 = Match0 ? Y0 : OuterV0;
2955 X1 = Match1 ? X1 : OuterV1;
2956 Y1 = Match1 ? Y1 : OuterV1;
2957 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2958 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2959 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2960 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2961 X0->getType() != X1->getType())
2962 return false;
2963
2964 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2965 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2966
2967 // Attempt to merge shuffles, matching upto 2 source operands.
2968 // Replace index to a poison arg with PoisonMaskElem.
2969 // Bail if either inner masks reference an undef arg.
2970 SmallVector<int, 16> NewMask(OuterMask);
2971 Value *NewX = nullptr, *NewY = nullptr;
2972 for (int &M : NewMask) {
2973 Value *Src = nullptr;
2974 if (0 <= M && M < (int)NumImmElts) {
2975 Src = OuterV0;
2976 if (Match0) {
2977 M = InnerMask0[M];
2978 Src = M >= (int)NumSrcElts ? Y0 : X0;
2979 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2980 }
2981 } else if (M >= (int)NumImmElts) {
2982 Src = OuterV1;
2983 M -= NumImmElts;
2984 if (Match1) {
2985 M = InnerMask1[M];
2986 Src = M >= (int)NumSrcElts ? Y1 : X1;
2987 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2988 }
2989 }
2990 if (Src && M != PoisonMaskElem) {
2991 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2992 if (isa<UndefValue>(Src)) {
2993 // We've referenced an undef element - if its poison, update the shuffle
2994 // mask, else bail.
2995 if (!isa<PoisonValue>(Src))
2996 return false;
2997 M = PoisonMaskElem;
2998 continue;
2999 }
3000 if (!NewX || NewX == Src) {
3001 NewX = Src;
3002 continue;
3003 }
3004 if (!NewY || NewY == Src) {
3005 M += NumSrcElts;
3006 NewY = Src;
3007 continue;
3008 }
3009 return false;
3010 }
3011 }
3012
3013 if (!NewX)
3014 return PoisonValue::get(ShuffleDstTy);
3015 if (!NewY)
3016 NewY = PoisonValue::get(ShuffleSrcTy);
3017
3018 // Have we folded to an Identity shuffle?
3019 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
3020 replaceValue(I, *NewX);
3021 return true;
3022 }
3023
3024 // Try to merge the shuffles if the new shuffle is not costly.
3025 InstructionCost InnerCost0 = 0;
3026 if (Match0)
3027 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
3028
3029 InstructionCost InnerCost1 = 0;
3030 if (Match1)
3031 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
3032
3034
3035 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
3036
3037 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
3041 InstructionCost NewCost =
3042 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
3043 nullptr, {NewX, NewY});
3044 if (!OuterV0->hasOneUse())
3045 NewCost += InnerCost0;
3046 if (!OuterV1->hasOneUse())
3047 NewCost += InnerCost1;
3048
3049 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
3050 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3051 << "\n");
3052 if (NewCost > OldCost)
3053 return false;
3054
3055 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
3056 replaceValue(I, *Shuf);
3057 return true;
3058}
3059
3060/// Try to convert a chain of length-preserving shuffles that are fed by
3061/// length-changing shuffles from the same source, e.g. a chain of length 3:
3062///
3063/// "shuffle (shuffle (shuffle x, (shuffle y, undef)),
3064/// (shuffle y, undef)),
3065/// (shuffle y, undef)"
3066///
3067/// into a single shuffle fed by a length-changing shuffle:
3068///
3069/// "shuffle x, (shuffle y, undef)"
3070///
3071/// Such chains arise e.g. from folding extract/insert sequences.
///
/// The walk starts at \p I and follows the length-preserving operand (the
/// "trunk") for as long as exactly one operand per step is a length-changing
/// "leaf" shuffle and all leaves read from one common value.
///
/// \param I The root shuffle of the candidate chain.
/// \returns true if \p I was replaced; false if fewer than two chain links
/// were found.
3072bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
3073 FixedVectorType *TrunkType = dyn_cast<FixedVectorType>(I.getType());
3074 if (!TrunkType)
3075 return false;
3076
// Mask: accumulated trunk-level mask. YMask: accumulated leaf mask into Y.
3077 unsigned ChainLength = 0;
3078 SmallVector<int> Mask;
3079 SmallVector<int> YMask;
3080 InstructionCost OldCost = 0;
3081 InstructionCost NewCost = 0;
3082 Value *Trunk = &I;
3083 unsigned NumTrunkElts = TrunkType->getNumElements();
3084 Value *Y = nullptr;
3085
3086 for (;;) {
3087 // Match the current trunk against (commutations of) the pattern
3088 // "shuffle trunk', (shuffle y, undef)"
3089 ArrayRef<int> OuterMask;
3090 Value *OuterV0, *OuterV1;
// Every link except the root must have a single use.
3091 if (ChainLength != 0 && !Trunk->hasOneUse())
3092 break;
3093 if (!match(Trunk, m_Shuffle(m_Value(OuterV0), m_Value(OuterV1),
3094 m_Mask(OuterMask))))
3095 break;
3096 if (OuterV0->getType() != TrunkType) {
3097 // This shuffle is not length-preserving, so it cannot be part of the
3098 // chain.
3099 break;
3100 }
3101
3102 ArrayRef<int> InnerMask0, InnerMask1;
3103 Value *A0, *A1, *B0, *B1;
3104 bool Match0 =
3105 match(OuterV0, m_Shuffle(m_Value(A0), m_Value(B0), m_Mask(InnerMask0)));
3106 bool Match1 =
3107 match(OuterV1, m_Shuffle(m_Value(A1), m_Value(B1), m_Mask(InnerMask1)));
// A "leaf" is an inner shuffle whose input type differs from the root's type.
3108 bool Match0Leaf = Match0 && A0->getType() != I.getType();
3109 bool Match1Leaf = Match1 && A1->getType() != I.getType();
3110 if (Match0Leaf == Match1Leaf) {
3111 // Only handle the case of exactly one leaf in each step. The "two leaves"
3112 // case is handled by foldShuffleOfShuffles.
3113 break;
3114 }
3115
// Canonicalise so the leaf is always operand 1, commuting the outer mask to
// match.
3116 SmallVector<int> CommutedOuterMask;
3117 if (Match0Leaf) {
3118 std::swap(OuterV0, OuterV1);
3119 std::swap(InnerMask0, InnerMask1);
3120 std::swap(A0, A1);
3121 std::swap(B0, B1);
3122 llvm::append_range(CommutedOuterMask, OuterMask);
3123 for (int &M : CommutedOuterMask) {
3124 if (M == PoisonMaskElem)
3125 continue;
3126 if (M < (int)NumTrunkElts)
3127 M += NumTrunkElts;
3128 else
3129 M -= NumTrunkElts;
3130 }
3131 OuterMask = CommutedOuterMask;
3132 }
3133 if (!OuterV1->hasOneUse())
3134 break;
3135
// Every non-undef leaf input must be the same value Y across the whole chain.
3136 if (!isa<UndefValue>(A1)) {
3137 if (!Y)
3138 Y = A1;
3139 else if (Y != A1)
3140 break;
3141 }
3142 if (!isa<UndefValue>(B1)) {
3143 if (!Y)
3144 Y = B1;
3145 else if (Y != B1)
3146 break;
3147 }
3148
// Rewrite the leaf mask as a single-source mask by folding second-input
// indices onto the first input.
3149 auto *YType = cast<FixedVectorType>(A1->getType());
3150 int NumLeafElts = YType->getNumElements();
3151 SmallVector<int> LocalYMask(InnerMask1);
3152 for (int &M : LocalYMask) {
3153 if (M >= NumLeafElts)
3154 M -= NumLeafElts;
3155 }
3156
3157 InstructionCost LocalOldCost =
3160
3161 // Handle the initial (start of chain) case.
3162 if (!ChainLength) {
3163 Mask.assign(OuterMask);
3164 YMask.assign(LocalYMask);
3165 OldCost = NewCost = LocalOldCost;
3166 Trunk = OuterV0;
3167 ChainLength++;
3168 continue;
3169 }
3170
3171 // For the non-root case, first attempt to combine masks.
// Lanes are merged pairwise: a -1 lane takes the other side's value, equal
// lanes are kept, and conflicting lanes end the chain.
3172 SmallVector<int> NewYMask(YMask);
3173 bool Valid = true;
3174 for (auto [CombinedM, LeafM] : llvm::zip(NewYMask, LocalYMask)) {
3175 if (LeafM == -1 || CombinedM == LeafM)
3176 continue;
3177 if (CombinedM == -1) {
3178 CombinedM = LeafM;
3179 } else {
3180 Valid = false;
3181 break;
3182 }
3183 }
3184 if (!Valid)
3185 break;
3186
// Compose the accumulated mask with this step's outer mask. Entries that are
// negative or already select from the leaf side are copied unchanged.
3187 SmallVector<int> NewMask;
3188 NewMask.reserve(NumTrunkElts);
3189 for (int M : Mask) {
3190 if (M < 0 || M >= static_cast<int>(NumTrunkElts))
3191 NewMask.push_back(M);
3192 else
3193 NewMask.push_back(OuterMask[M]);
3194 }
3195
3196 // Break the chain if adding this new step complicates the shuffles such
3197 // that it would increase the new cost by more than the old cost of this
3198 // step.
3199 InstructionCost LocalNewCost =
3201 YType, NewYMask, CostKind) +
3203 TrunkType, NewMask, CostKind);
3204
3205 if (LocalNewCost >= NewCost && LocalOldCost < LocalNewCost - NewCost)
3206 break;
3207
3208 LLVM_DEBUG({
3209 if (ChainLength == 1) {
3210 dbgs() << "Found chain of shuffles fed by length-changing shuffles: "
3211 << I << '\n';
3212 }
3213 dbgs() << " next chain link: " << *Trunk << '\n'
3214 << " old cost: " << (OldCost + LocalOldCost)
3215 << " new cost: " << LocalNewCost << '\n';
3216 });
3217
// Commit this link and move on to the next trunk.
3218 Mask = NewMask;
3219 YMask = NewYMask;
3220 OldCost += LocalOldCost;
3221 NewCost = LocalNewCost;
3222 Trunk = OuterV0;
3223 ChainLength++;
3224 }
// A chain with at most one link leaves nothing to merge.
3225 if (ChainLength <= 1)
3226 return false;
3227
// NOTE(review): Y is dereferenced below; it appears to stay null if every
// leaf input seen was undef - confirm that cannot reach this point.
3228 if (llvm::all_of(Mask, [&](int M) {
3229 return M < 0 || M >= static_cast<int>(NumTrunkElts);
3230 })) {
3231 // Produce a canonical simplified form if all elements are sourced from Y.
3232 for (int &M : Mask) {
3233 if (M >= static_cast<int>(NumTrunkElts))
3234 M = YMask[M - NumTrunkElts];
3235 }
3236 Value *Root =
3237 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), Mask);
3238 replaceValue(I, *Root);
3239 return true;
3240 }
3241
// General case: one merged leaf shuffle of Y feeding one shuffle of the
// remaining trunk.
3242 Value *Leaf =
3243 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), YMask);
3244 Value *Root = Builder.CreateShuffleVector(Trunk, Leaf, Mask);
3245 replaceValue(I, *Root);
3246 return true;
3247}
3248
3249/// Try to convert
3250/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
///
/// Both shuffle operands must be calls to the same trivially vectorizable
/// intrinsic. Identical operand pairs are costed once and share one shuffle.
///
/// \param I The candidate outer shuffle.
/// \returns true if \p I was replaced by the new intrinsic call.
3251bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
3252 Value *V0, *V1;
3253 ArrayRef<int> OldMask;
3254 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
3255 return false;
3256
3257 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3258 auto *II1 = dyn_cast<IntrinsicInst>(V1);
3259 if (!II0 || !II1)
3260 return false;
3261
3262 Intrinsic::ID IID = II0->getIntrinsicID();
3263 if (IID != II1->getIntrinsicID())
3264 return false;
3265 InstructionCost CostII0 =
3266 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3267 InstructionCost CostII1 =
3268 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind);
3269
3270 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3271 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
3272 if (!ShuffleDstTy || !II0Ty)
3273 return false;
3274
3275 if (!isTriviallyVectorizable(IID))
3276 return false;
3277
// NOTE: the loop index `I` in the loops below shadows the Instruction
// parameter `I` for the duration of each loop.
3278 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3280 II0->getArgOperand(I) != II1->getArgOperand(I))
3281 return false;
3282
3283 InstructionCost OldCost =
3284 CostII0 + CostII1 +
3286 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
3287
// NewArgsTy collects the argument types of the new call; vector arguments are
// widened/narrowed to the element count of the shuffle result.
3288 SmallVector<Type *> NewArgsTy;
3289 InstructionCost NewCost = 0;
3290 SmallDenseSet<std::pair<Value *, Value *>> SeenOperandPairs;
3291 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3293 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3294 } else {
3295 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3296 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3297 ShuffleDstTy->getNumElements());
3298 NewArgsTy.push_back(ArgTy);
3299 std::pair<Value *, Value *> OperandPair =
3300 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3301 if (!SeenOperandPairs.insert(OperandPair).second) {
3302 // We've already computed the cost for this operand pair.
3303 continue;
3304 }
3305 NewCost += TTI.getShuffleCost(
3306 TargetTransformInfo::SK_PermuteTwoSrc, ArgTy, VecTy, OldMask,
3307 CostKind, 0, nullptr, {II0->getArgOperand(I), II1->getArgOperand(I)});
3308 }
3309 }
3310 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3311
3312 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
// Intrinsic calls with other users stay in the IR, so their cost is still
// paid. When both operands are the same call it is only counted once.
3313 if (!II0->hasOneUse())
3314 NewCost += CostII0;
3315 if (II1 != II0 && !II1->hasOneUse())
3316 NewCost += CostII1;
3317
3318 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
3319 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3320 << "\n");
3321
// Equal cost is accepted.
3322 if (NewCost > OldCost)
3323 return false;
3324
// Build the new argument list; ShuffleCache mirrors the operand-pair
// deduplication used in the cost loop above.
3325 SmallVector<Value *> NewArgs;
3326 SmallDenseMap<std::pair<Value *, Value *>, Value *> ShuffleCache;
3327 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3329 NewArgs.push_back(II0->getArgOperand(I));
3330 } else {
3331 std::pair<Value *, Value *> OperandPair =
3332 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3333 auto It = ShuffleCache.find(OperandPair);
3334 if (It != ShuffleCache.end()) {
3335 // Reuse previously created shuffle for this operand pair.
3336 NewArgs.push_back(It->second);
3337 continue;
3338 }
3339 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
3340 II1->getArgOperand(I), OldMask);
3341 ShuffleCache[OperandPair] = Shuf;
3342 NewArgs.push_back(Shuf);
3343 Worklist.pushValue(Shuf);
3344 }
3345 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3346
3347 // Intersect flags from the old intrinsics.
3348 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
3349 NewInst->copyIRFlags(II0);
3350 NewInst->andIRFlags(II1);
3351 }
3352
3353 replaceValue(I, *NewIntrinsic);
3354 return true;
3355}
3356
3357/// Try to convert
3358/// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
///
/// The intrinsic must be trivially vectorizable and the mask may only select
/// lanes of the first shuffle operand.
///
/// \param I The candidate single-source shuffle.
/// \returns true if \p I was replaced by the new intrinsic call.
3359bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
3360 Value *V0;
3361 ArrayRef<int> Mask;
3362 if (!match(&I, m_Shuffle(m_Value(V0), m_Undef(), m_Mask(Mask))))
3363 return false;
3364
3365 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3366 if (!II0)
3367 return false;
3368
3369 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3370 auto *IntrinsicSrcTy = dyn_cast<FixedVectorType>(II0->getType());
3371 if (!ShuffleDstTy || !IntrinsicSrcTy)
3372 return false;
3373
3374 // Validate it's a pure permute, mask should only reference the first vector
3375 unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
3376 if (any_of(Mask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
3377 return false;
3378
3379 Intrinsic::ID IID = II0->getIntrinsicID();
3380 if (!isTriviallyVectorizable(IID))
3381 return false;
3382
3383 // Cost analysis
3385 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3386 InstructionCost OldCost =
3389 IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
3390
// NOTE: the loop index `I` in the loops below shadows the Instruction
// parameter `I` for the duration of each loop.
3391 SmallVector<Type *> NewArgsTy;
3392 InstructionCost NewCost = 0;
3393 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3395 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3396 } else {
// Vector arguments take the element count of the shuffle result.
3397 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3398 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3399 ShuffleDstTy->getNumElements());
3400 NewArgsTy.push_back(ArgTy);
3402 ArgTy, VecTy, Mask, CostKind, 0, nullptr,
3403 {II0->getArgOperand(I)});
3404 }
3405 }
3406 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3407 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3408
3409 // If the intrinsic has multiple uses, we need to account for the cost of
3410 // keeping the original intrinsic around.
3411 if (!II0->hasOneUse())
3412 NewCost += IntrinsicCost;
3413
3414 LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n OldCost: "
3415 << OldCost << " vs NewCost: " << NewCost << "\n");
3416
// Equal cost is accepted.
3417 if (NewCost > OldCost)
3418 return false;
3419
3420 // Transform
3421 SmallVector<Value *> NewArgs;
3422 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3424 NewArgs.push_back(II0->getArgOperand(I));
3425 } else {
3426 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), Mask);
3427 NewArgs.push_back(Shuf);
3428 Worklist.pushValue(Shuf);
3429 }
3430 }
3431
3432 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3433
// Carry the original call's IR flags over to the new call.
3434 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic))
3435 NewInst->copyIRFlags(II0);
3436
3437 replaceValue(I, *NewIntrinsic);
3438 return true;
3439}
3440
3441using InstLane = std::pair<Use *, int>;
3442
3443static InstLane lookThroughShuffles(Use *U, int Lane) {
3444 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
3445 unsigned NumElts =
3446 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
3447 int M = SV->getMaskValue(Lane);
3448 if (M < 0)
3449 return {nullptr, PoisonMaskElem};
3450 if (static_cast<unsigned>(M) < NumElts) {
3451 U = &SV->getOperandUse(0);
3452 Lane = M;
3453 } else {
3454 U = &SV->getOperandUse(1);
3455 Lane = M - NumElts;
3456 }
3457 }
3458 return InstLane{U, Lane};
3459}
3460
// For each (use, lane) entry, take operand `Op` of the instruction behind the
// use and trace the lane back through any shuffles. Entries with a null use
// map to {nullptr, PoisonMaskElem}.
3464 for (InstLane IL : Item) {
3465 auto [U, Lane] = IL;
3466 InstLane OpLane =
3467 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
3468 Lane)
3469 : InstLane{nullptr, PoisonMaskElem};
3470 NItem.emplace_back(OpLane);
3471 }
3472 return NItem;
3473}
3474
3475/// Detect concat of multiple values into a vector
///
/// Returns true only when \p Item is a power-of-two number of whole slices,
/// each slice reading lanes 0..NumElts-1 in order from a single value of the
/// front entry's vector type, and the target reports a zero-cost two-source
/// concatenation shuffle for that type.
3477 const TargetTransformInfo &TTI) {
3478 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
3479 unsigned NumElts = Ty->getNumElements();
// Reject a single slice, single-element vectors and ragged sizes.
3480 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
3481 return false;
3482
3483 // Check that the concat is free, usually meaning that the type will be split
3484 // during legalization.
3485 SmallVector<int, 16> ConcatMask(NumElts * 2);
3486 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
3487 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
3488 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
3489 Ty, ConcatMask, CostKind) != 0)
3490 return false;
3491
3492 unsigned NumSlices = Item.size() / NumElts;
3493 // Currently we generate a tree of shuffles for the concats, which limits us
3494 // to a power2.
3495 if (!isPowerOf2_32(NumSlices))
3496 return false;
3497 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
3498 Use *SliceV = Item[Slice * NumElts].first;
3499 if (!SliceV || SliceV->get()->getType() != Ty)
3500 return false;
// Every element of the slice must come from the same value, in lane order.
// NOTE(review): V is dereferenced without a null check here; only the first
// entry of each slice (SliceV) is null-checked - confirm callers never pass
// a null use in other positions.
3501 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
3502 auto [V, Lane] = Item[Slice * NumElts + Elt];
3503 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
3504 return false;
3505 }
3506 }
3507 return true;
3508}
3509
// NOTE(review): the first line of this signature (return type, name and the
// leading parameters) is not visible in this listing. The body uses Item and
// Ty, and the visible call site passes (Start, Ty, IdentityLeafs, SplatLeafs,
// ConcatLeafs, Builder, &TTI) - confirm against the full source.
//
// Re-creates, at Builder's insertion point, the value described by the lanes
// in Item. Leaves recorded in one of the three leaf sets are materialized
// directly; any other node is rebuilt as the same kind of instruction from
// freshly generated operands.
3511 const SmallPtrSet<Use *, 4> &IdentityLeafs,
3512 const SmallPtrSet<Use *, 4> &SplatLeafs,
3513 const SmallPtrSet<Use *, 4> &ConcatLeafs,
3514 IRBuilderBase &Builder,
3515 const TargetTransformInfo *TTI) {
3516 auto [FrontU, FrontLane] = Item.front();
3517
// Identity leaf: the existing value is used as-is.
3518 if (IdentityLeafs.contains(FrontU)) {
3519 return FrontU->get();
3520 }
// Splat leaf: broadcast lane FrontLane of the leaf to every result lane.
3521 if (SplatLeafs.contains(FrontU)) {
3522 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
3523 return Builder.CreateShuffleVector(FrontU->get(), Mask);
3524 }
// Concat leaf: collect the first value of every NumElts-wide slice, then join
// neighbouring values pairwise with identity-mask shuffles, doubling the
// width each round until a single value remains.
3525 if (ConcatLeafs.contains(FrontU)) {
3526 unsigned NumElts =
3527 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
3528 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
3529 for (unsigned S = 0; S < Values.size(); ++S)
3530 Values[S] = Item[S * NumElts].first->get();
3531
3532 while (Values.size() > 1) {
3533 NumElts *= 2;
3534 SmallVector<int, 16> Mask(NumElts, 0);
3535 std::iota(Mask.begin(), Mask.end(), 0);
3536 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
3537 for (unsigned S = 0; S < NewValues.size(); ++S)
3538 NewValues[S] =
3539 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
3540 Values = NewValues;
3541 }
3542 return Values[0];
3543 }
3544
// Interior node: rebuild each operand, then re-emit the same operation.
3545 auto *I = cast<Instruction>(FrontU->get());
3546 auto *II = dyn_cast<IntrinsicInst>(I);
// For an intrinsic call the last operand is excluded from the operand count.
3547 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
// NOTE(review): the declaration of Ops is not visible in this listing.
3549 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
// Operands that the intrinsic takes as scalars are forwarded unchanged.
3550 if (II &&
3551 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
3552 Ops[Idx] = II->getOperand(Idx);
3553 continue;
3554 }
// NOTE(review): the head of this statement is not visible in this listing;
// presumably it assigns Ops[Idx] from a recursive call on the lanes of operand
// Idx - confirm against the full source.
3556 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
3557 Builder, TTI);
3558 }
3559
// Gather the values of all non-null lanes; they are handed to
// propagateIRFlags() for each newly created instruction below.
3560 SmallVector<Value *, 8> ValueList;
3561 for (const auto &Lane : Item)
3562 if (Lane.first)
3563 ValueList.push_back(Lane.first->get());
3564
// Result type used for casts and intrinsics: scalar type of I combined with
// the element count of Ty.
3565 Type *DstTy =
3566 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
3567 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
3568 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
3569 Ops[0], Ops[1]);
3570 propagateIRFlags(Value, ValueList);
3571 return Value;
3572 }
3573 if (auto *CI = dyn_cast<CmpInst>(I)) {
3574 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3575 propagateIRFlags(Value, ValueList);
3576 return Value;
3577 }
3578 if (auto *SI = dyn_cast<SelectInst>(I)) {
3579 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3580 propagateIRFlags(Value, ValueList);
3581 return Value;
3582 }
3583 if (auto *CI = dyn_cast<CastInst>(I)) {
3584 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3585 propagateIRFlags(Value, ValueList);
3586 return Value;
3587 }
3588 if (II) {
3589 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3590 propagateIRFlags(Value, ValueList);
3591 return Value;
3592 }
// Anything that reaches this point is asserted to be a unary instruction.
3593 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3594 auto *Value =
3595 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3596 propagateIRFlags(Value, ValueList);
3597 return Value;
3598}
3599
3600// Starting from a shuffle, look up through operands tracking the shuffled index
3601// of each lane. If we can simplify away the shuffles to identities then
3602// do so.
//
// The function works in two phases: a worklist scan classifies every reached
// lane group as an identity, splat or concat leaf (or as a supported operation
// whose operands are scanned in turn), and on success generateNewInstTree()
// rebuilds the tree and I is replaced. Returns true only when I was replaced.
3603bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3604 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3605 if (!Ty || I.use_empty())
3606 return false;
3607
// Seed one InstLane per result lane, tracing through the first use of I.
3608 SmallVector<InstLane> Start(Ty->getNumElements());
3609 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3610 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
3611
// NOTE(review): the declaration of Worklist is not visible in this listing.
3613 Worklist.push_back(Start);
3614 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
3615 unsigned NumVisited = 0;
3616
3617 while (!Worklist.empty()) {
// Bound the amount of work done per root instruction.
3618 if (++NumVisited > MaxInstrsToScan)
3619 return false;
3620
3621 SmallVector<InstLane> Item = Worklist.pop_back_val();
3622 auto [FrontU, FrontLane] = Item.front();
3623
3624 // If we found an undef first lane then bail out to keep things simple.
3625 if (!FrontU)
3626 return false;
3627
3628 // Helper to peek through bitcasts to the same value.
// NOTE(review): the second half of this lambda's return expression is not
// visible in this listing.
3629 auto IsEquiv = [&](Value *X, Value *Y) {
3630 return X->getType() == Y->getType() &&
3632 };
3633
3634 // Look for an identity value.
// Requires the front lane to be lane 0 of a value with the same element count
// as the result, and every other non-null lane to be lane <index> of an
// equivalent value.
3635 if (FrontLane == 0 &&
3636 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
3637 Ty->getNumElements() &&
3638 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3639 Value *FrontV = Item.front().first->get();
3640 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
3641 E.value().second == (int)E.index());
3642 })) {
3643 IdentityLeafs.insert(FrontU);
3644 continue;
3645 }
3646 // Look for constants, for the moment only supporting constant splats.
3647 if (auto *C = dyn_cast<Constant>(FrontU);
3648 C && C->getSplatValue() &&
3649 all_of(drop_begin(Item), [Item](InstLane &IL) {
3650 Value *FrontV = Item.front().first->get();
3651 Use *U = IL.first;
3652 return !U || (isa<Constant>(U->get()) &&
3653 cast<Constant>(U->get())->getSplatValue() ==
3654 cast<Constant>(FrontV)->getSplatValue());
3655 })) {
3656 SplatLeafs.insert(FrontU);
3657 continue;
3658 }
3659 // Look for a splat value.
// Every non-null lane must read the same lane of the same value as the front.
3660 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3661 auto [FrontU, FrontLane] = Item.front();
3662 auto [U, Lane] = IL;
3663 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3664 })) {
3665 SplatLeafs.insert(FrontU);
3666 continue;
3667 }
3668
3669 // We need each element to be the same type of value, and check that each
3670 // element has a single use.
3671 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3672 Value *FrontV = Item.front().first->get();
3673 if (!IL.first)
3674 return true;
3675 Value *V = IL.first->get();
3676 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3677 return false;
3678 if (V->getValueID() != FrontV->getValueID())
3679 return false;
// Compares must agree on the predicate.
3680 if (auto *CI = dyn_cast<CmpInst>(V))
3681 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3682 return false;
// Casts must agree on the scalar source type.
3683 if (auto *CI = dyn_cast<CastInst>(V))
3684 if (CI->getSrcTy()->getScalarType() !=
3685 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3686 return false;
// Selects must have a vector condition of the same type as the front select.
3687 if (auto *SI = dyn_cast<SelectInst>(V))
3688 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3689 SI->getOperand(0)->getType() !=
3690 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3691 return false;
// Calls are only accepted when they are intrinsic calls.
3692 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3693 return false;
3694 auto *II = dyn_cast<IntrinsicInst>(V);
3695 return !II || (isa<IntrinsicInst>(FrontV) &&
3696 II->getIntrinsicID() ==
3697 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3698 !II->hasOperandBundles());
3699 };
3700 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3701 // Check the operator is one that we support.
// NOTE(review): in each supported branch below, the statements before
// 'continue' are not visible in this listing; presumably they push the
// operand lane vectors onto Worklist - confirm against the full source.
3702 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3703 // We exclude div/rem in case they hit UB from poison lanes.
3704 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3705 BO && BO->isIntDivRem())
3706 return false;
3709 continue;
3710 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3711 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3713 continue;
3714 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3715 // TODO: Handle vector widening/narrowing bitcasts.
3716 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3717 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3718 if (DstTy && SrcTy &&
3719 SrcTy->getNumElements() == DstTy->getNumElements()) {
3721 continue;
3722 }
3723 } else if (isa<SelectInst>(FrontU)) {
3727 continue;
3728 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3729 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3730 !II->hasOperandBundles()) {
3731 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
// An operand the intrinsic takes as a scalar must be the identical value in
// every non-null lane; it is not traced any further.
3732 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3733 &TTI)) {
3734 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3735 Value *FrontV = Item.front().first->get();
3736 Use *U = IL.first;
3737 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3738 cast<Instruction>(FrontV)->getOperand(Op));
3739 }))
3740 return false;
3741 continue;
3742 }
3744 }
3745 continue;
3746 }
3747 }
3748
// Last resort: accept the item as a leaf if it is a free concat of vectors.
3749 if (isFreeConcat(Item, CostKind, TTI)) {
3750 ConcatLeafs.insert(FrontU);
3751 continue;
3752 }
3753
3754 return false;
3755 }
3756
// If only the starting item was visited there is nothing to simplify.
3757 if (NumVisited <= 1)
3758 return false;
3759
3760 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3761
3762 // If we got this far, we know the shuffles are superfluous and can be
3763 // removed. Scan through again and generate the new tree of instructions.
3764 Builder.SetInsertPoint(&I);
3765 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3766 ConcatLeafs, Builder, &TTI);
3767 replaceValue(I, *V);
3768 return true;
3769}
3770
3771/// Given a commutative reduction, the order of the input lanes does not alter
3772/// the results. We can use this to remove certain shuffles feeding the
3773/// reduction, removing the need to shuffle at all.
///
/// Returns true if the shuffle was replaced by one with a sorted mask, or if
/// foldSelectShuffle() made a change on the shuffle; false otherwise.
3774bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3775 auto *II = dyn_cast<IntrinsicInst>(&I);
3776 if (!II)
3777 return false;
// Only the integer reductions listed here are handled.
3778 switch (II->getIntrinsicID()) {
3779 case Intrinsic::vector_reduce_add:
3780 case Intrinsic::vector_reduce_mul:
3781 case Intrinsic::vector_reduce_and:
3782 case Intrinsic::vector_reduce_or:
3783 case Intrinsic::vector_reduce_xor:
3784 case Intrinsic::vector_reduce_smin:
3785 case Intrinsic::vector_reduce_smax:
3786 case Intrinsic::vector_reduce_umin:
3787 case Intrinsic::vector_reduce_umax:
3788 break;
3789 default:
3790 return false;
3791 }
3792
3793 // Find all the inputs when looking through operations that do not alter the
3794 // lane order (binops, for example). Currently we look for a single shuffle,
3795 // and can ignore splat values.
3796 std::queue<Value *> Worklist;
3797 SmallPtrSet<Value *, 4> Visited;
3798 ShuffleVectorInst *Shuffle = nullptr;
3799 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3800 Worklist.push(Op);
3801
3802 while (!Worklist.empty()) {
3803 Value *CV = Worklist.front();
3804 Worklist.pop();
3805 if (Visited.contains(CV))
3806 continue;
3807
3808 // Splats don't change the order, so can be safely ignored.
3809 if (isSplatValue(CV))
3810 continue;
3811
3812 Visited.insert(CV);
3813
3814 if (auto *CI = dyn_cast<Instruction>(CV)) {
3815 if (CI->isBinaryOp()) {
3816 for (auto *Op : CI->operand_values())
3817 Worklist.push(Op);
3818 continue;
3819 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
// At most one distinct shuffle may feed the reduction.
3820 if (Shuffle && Shuffle != SV)
3821 return false;
3822 Shuffle = SV;
3823 continue;
3824 }
3825 }
3826
3827 // Anything else is currently an unknown node.
3828 return false;
3829 }
3830
3831 if (!Shuffle)
3832 return false;
3833
3834 // Check all uses of the binary ops and shuffles are also included in the
3835 // lane-invariant operations (Visited should be the list of lanewise
3836 // instructions, including the shuffle that we found).
3837 for (auto *V : Visited)
3838 for (auto *U : V->users())
3839 if (!Visited.contains(U) && U != &I)
3840 return false;
3841
3842 FixedVectorType *VecType =
3843 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3844 if (!VecType)
3845 return false;
// NOTE(review): the initializer of ShuffleInputType is not visible in this
// listing.
3846 FixedVectorType *ShuffleInputType =
3848 if (!ShuffleInputType)
3849 return false;
3850 unsigned NumInputElts = ShuffleInputType->getNumElements();
3851
3852 // Find the mask from sorting the lanes into order. This is most likely to
3853 // become an identity or concat mask. Undef elements are pushed to the end.
3854 SmallVector<int> ConcatMask;
3855 Shuffle->getShuffleMask(ConcatMask);
// Comparing as unsigned makes negative mask elements compare largest, which
// is what moves them to the end.
3856 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3857 bool UsesSecondVec =
3858 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3859
// NOTE(review): the heads of the two cost computations below (presumably the
// definitions of OldCost and NewCost, which are used afterwards) are not
// visible in this listing. The first call is given the shuffle's current mask
// and the second the sorted mask.
3861 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3862 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3864 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3865 ShuffleInputType, ConcatMask, CostKind);
3866
3867 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3868 << "\n");
3869 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3870 << "\n");
3871 bool MadeChanges = false;
// Only rewrite when the sorted mask is strictly cheaper.
3872 if (NewCost < OldCost) {
3873 Builder.SetInsertPoint(Shuffle);
3874 Value *NewShuffle = Builder.CreateShuffleVector(
3875 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3876 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3877 replaceValue(*Shuffle, *NewShuffle);
3878 return true;
3879 }
3880
3881 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3882 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
// MadeChanges is still false here, so the result is that of foldSelectShuffle.
3883 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3884 return MadeChanges;
3885}
3886
3887/// For a given chain of patterns of the following form:
3888///
3889/// ```
3890/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3891///
3892/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3893/// ty1> %1)
3894/// OR
3895/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3896///
3897/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3898/// ...
3899/// ...
3900/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3901/// 3), <n x ty1> %(i - 2)
3902/// OR
3903/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3904///
3905/// %(i) = extractelement <n x ty1> %(i - 1), 0
3906/// ```
3907///
3908/// Where:
3909/// `mask` follows a partition pattern:
3910///
3911/// Ex:
3912/// [n = 8, p = poison]
3913///
3914/// 4 5 6 7 | p p p p
3915/// 2 3 | p p p p p p
3916/// 1 | p p p p p p p
3917///
3918/// For powers of 2, there's a consistent pattern, but for other cases
3919/// the parity of the current half value at each step decides the
3920/// next partition half (see `ExpectedParityMask` for more logical details
3921/// in generalising this).
3922///
3923/// Ex:
3924/// [n = 6]
3925///
3926/// 3 4 5 | p p p
3927/// 1 2 | p p p p
3928/// 1 | p p p p p
///
/// When the whole chain matches and the reduction intrinsic is costed strictly
/// cheaper than the accumulated cost of the chain, the extractelement is
/// replaced by a single reduction intrinsic call on the chain's input vector.
3929bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3930 // Going bottom-up for the pattern.
3931 std::queue<Value *> InstWorklist;
3932 InstructionCost OrigCost = 0;
3933
3934 // Common instruction operation after each shuffle op.
3935 std::optional<unsigned int> CommonCallOp = std::nullopt;
3936 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3937
// The walk alternates between a call/binop and a shuffle;
// ShouldBeCallOrBinInst records which of the two is expected next.
3938 bool IsFirstCallOrBinInst = true;
3939 bool ShouldBeCallOrBinInst = true;
3940
3941 // This stores the last used instructions for shuffle/common op.
3942 //
3943 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3944 // instructions from either shuffle/common op.
3945 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3946
// The root must be an extractelement of lane 0.
3947 Value *VecOpEE;
3948 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3949 return false;
3950
3951 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3952 if (!FVT)
3953 return false;
3954
3955 int64_t VecSize = FVT->getNumElements();
3956 if (VecSize < 2)
3957 return false;
3958
3959 // Number of levels would be ~log2(n), considering we always partition
3960 // by half for this fold pattern.
3961 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3962 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3963
3964 // This is how we generalise for all element sizes.
3965 // At each step, if vector size is odd, we need non-poison
3966 // values to cover the dominant half so we don't miss out on any element.
3967 //
3968 // This mask will help us retrieve this as we go from bottom to top:
3969 //
3970 // Mask Set -> N = N * 2 - 1
3971 // Mask Unset -> N = N * 2
3972 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3973 Cur = (Cur + 1) / 2, --Mask) {
3974 if (Cur & 1)
3975 ExpectedParityMask |= (1ll << Mask);
3976 }
3977
3978 InstWorklist.push(VecOpEE);
3979
3980 while (!InstWorklist.empty()) {
3981 Value *CI = InstWorklist.front();
3982 InstWorklist.pop();
3983
3984 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3985 if (!ShouldBeCallOrBinInst)
3986 return false;
3987
3988 if (!IsFirstCallOrBinInst && any_of(PrevVecV, equal_to(nullptr)))
3989 return false;
3990
3991 // For the first found call/bin op, the vector has to come from the
3992 // extract element op.
3993 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3994 return false;
3995 IsFirstCallOrBinInst = false;
3996
// Every call in the chain must use the same intrinsic.
3997 if (!CommonCallOp)
3998 CommonCallOp = II->getIntrinsicID();
3999 if (II->getIntrinsicID() != *CommonCallOp)
4000 return false;
4001
4002 switch (II->getIntrinsicID()) {
4003 case Intrinsic::umin:
4004 case Intrinsic::umax:
4005 case Intrinsic::smin:
4006 case Intrinsic::smax: {
4007 auto *Op0 = II->getOperand(0);
4008 auto *Op1 = II->getOperand(1);
4009 PrevVecV[0] = Op0;
4010 PrevVecV[1] = Op1;
4011 break;
4012 }
4013 default:
4014 return false;
4015 }
// A shuffle is expected next.
4016 ShouldBeCallOrBinInst ^= 1;
4017
4018 IntrinsicCostAttributes ICA(
4019 *CommonCallOp, II->getType(),
4020 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
4021 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
4022
4023 // We may need a swap here since it can be (a, b) or (b, a)
4024 // and accordingly change as we go up.
4025 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
4026 std::swap(PrevVecV[0], PrevVecV[1]);
// PrevVecV[1] is queued first, so it is the next value inspected.
4027 InstWorklist.push(PrevVecV[1]);
4028 InstWorklist.push(PrevVecV[0]);
4029 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
4030 // Similar logic for bin ops.
4031
4032 if (!ShouldBeCallOrBinInst)
4033 return false;
4034
4035 if (!IsFirstCallOrBinInst && any_of(PrevVecV, equal_to(nullptr)))
4036 return false;
4037
4038 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
4039 return false;
4040 IsFirstCallOrBinInst = false;
4041
// Every binop in the chain must use the same opcode.
4042 if (!CommonBinOp)
4043 CommonBinOp = BinOp->getOpcode();
4044
4045 if (BinOp->getOpcode() != *CommonBinOp)
4046 return false;
4047
4048 switch (*CommonBinOp) {
4049 case BinaryOperator::Add:
4050 case BinaryOperator::Mul:
4051 case BinaryOperator::Or:
4052 case BinaryOperator::And:
4053 case BinaryOperator::Xor: {
4054 auto *Op0 = BinOp->getOperand(0);
4055 auto *Op1 = BinOp->getOperand(1);
4056 PrevVecV[0] = Op0;
4057 PrevVecV[1] = Op1;
4058 break;
4059 }
4060 default:
4061 return false;
4062 }
4063 ShouldBeCallOrBinInst ^= 1;
4064
4065 OrigCost +=
4066 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
4067
4068 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
4069 std::swap(PrevVecV[0], PrevVecV[1]);
4070 InstWorklist.push(PrevVecV[1]);
4071 InstWorklist.push(PrevVecV[0]);
4072 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
4073 // We shouldn't have any null values in the previous vectors,
4074 // if so, there was a mismatch in pattern.
4075 if (ShouldBeCallOrBinInst || any_of(PrevVecV, equal_to(nullptr)))
4076 return false;
4077
4078 if (SVInst != PrevVecV[1])
4079 return false;
4080
// The shuffle must take the other operand of the preceding op as its first
// input and poison as its second.
4081 ArrayRef<int> CurMask;
4082 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
4083 m_Mask(CurMask))))
4084 return false;
4085
4086 // Subtract the parity mask when checking the condition.
// The first ShuffleMaskHalf mask elements must select consecutive lanes
// starting at ShuffleMaskHalf (less one when the parity bit is set); all the
// remaining mask elements must be -1.
4087 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
4088 if (Mask < ShuffleMaskHalf &&
4089 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
4090 return false;
4091 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
4092 return false;
4093 }
4094
4095 // Update mask values.
4096 ShuffleMaskHalf *= 2;
4097 ShuffleMaskHalf -= (ExpectedParityMask & 1);
4098 ExpectedParityMask >>= 1;
4099
// NOTE(review): the head of this statement is not visible in this listing;
// presumably it adds this shuffle's cost to OrigCost - confirm against the
// full source.
4101 SVInst->getType(), SVInst->getType(),
4102 CurMask, CostKind);
4103
// Stop once every expected level has been matched.
4104 VisitedCnt += 1;
4105 if (!ExpectedParityMask && VisitedCnt == NumLevels)
4106 break;
4107
4108 ShouldBeCallOrBinInst ^= 1;
4109 } else {
4110 return false;
4111 }
4112 }
4113
4114 // Pattern should end with a shuffle op.
4115 if (ShouldBeCallOrBinInst)
4116 return false;
4117
// VecSize was already required to be at least 2 above, so this always holds.
4118 assert(VecSize != -1 && "Expected Match for Vector Size");
4119
4120 Value *FinalVecV = PrevVecV[0];
4121 if (!FinalVecV)
4122 return false;
4123
4124 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
4125
// Map the common per-step operation to the matching reduction intrinsic.
4126 Intrinsic::ID ReducedOp =
4127 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
4128 : getReductionForBinop(*CommonBinOp));
4129 if (!ReducedOp)
4130 return false;
4131
4132 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
// NOTE(review): the definition of NewCost is not visible in this listing;
// presumably it is the cost of the reduction described by ICA - confirm.
4134
4135 if (NewCost >= OrigCost)
4136 return false;
4137
4138 auto *ReducedResult =
4139 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
4140 replaceValue(I, *ReducedResult);
4141
4142 return true;
4143}
4144
4145/// Determine if it's more efficient to fold:
4146/// reduce(trunc(x)) -> trunc(reduce(x)).
4147/// reduce(sext(x)) -> sext(reduce(x)).
4148/// reduce(zext(x)) -> zext(reduce(x)).
///
/// add/mul reductions are only folded through trunc; and/or/xor reductions
/// are folded through trunc, zext or sext. The cast must have a single use.
/// Returns true when I was replaced by a cast of the new reduction.
4149bool VectorCombine::foldCastFromReductions(Instruction &I) {
4150 auto *II = dyn_cast<IntrinsicInst>(&I);
4151 if (!II)
4152 return false;
4153
4154 bool TruncOnly = false;
4155 Intrinsic::ID IID = II->getIntrinsicID();
4156 switch (IID) {
4157 case Intrinsic::vector_reduce_add:
4158 case Intrinsic::vector_reduce_mul:
4159 TruncOnly = true;
4160 break;
4161 case Intrinsic::vector_reduce_and:
4162 case Intrinsic::vector_reduce_or:
4163 case Intrinsic::vector_reduce_xor:
4164 break;
4165 default:
4166 return false;
4167 }
4168
4169 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4170 Value *ReductionSrc = I.getOperand(0);
4171
// Accept a one-use trunc, or (unless TruncOnly) a one-use zext/sext.
4172 Value *Src;
4173 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
4174 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
4175 return false;
4176
4177 auto CastOpc =
4178 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
4179
4180 auto *SrcTy = cast<VectorType>(Src->getType());
4181 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
4182 Type *ResultTy = I.getType();
4183
// NOTE(review): parts of the cost computations below are not visible in this
// listing (the definition of OldCost and one argument line of each
// getCastInstrCost call). As far as visible, the old cost is the reduction
// on the cast type plus the vector cast, and the new cost is the reduction
// on the source type plus a cast to ResultTy.
4185 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
4186 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
4188 cast<CastInst>(ReductionSrc));
4189 InstructionCost NewCost =
4190 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
4191 CostKind) +
4192 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
4194
// Require a strict improvement and a valid new cost.
4195 if (OldCost <= NewCost || !NewCost.isValid())
4196 return false;
4197
// Reduce the un-cast source, then apply the same cast to the scalar result.
4198 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
4199 II->getIntrinsicID(), {Src});
4200 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
4201 replaceValue(I, *NewCast);
4202 return true;
4203}
4204
4205/// Fold:
4206/// icmp pred (reduce.{add,or,and,umax,umin}(signbit_extract(x))), C
4207/// into:
4208/// icmp sgt/slt (reduce.{or,umax,and,umin}(x)), -1/0
4209///
4210/// Sign-bit reductions produce values with known semantics:
4211/// - reduce.{or,umax}: 0 if no element is negative, 1 if any is
4212/// - reduce.{and,umin}: 1 if all elements are negative, 0 if any isn't
4213/// - reduce.add: count of negative elements (0 to NumElts)
4214///
4215/// Both lshr and ashr are supported:
4216/// - lshr produces 0 or 1, so reduce.add range is [0, N]
4217/// - ashr produces 0 or -1, so reduce.add range is [-N, 0]
4218///
4219/// We transform to a direct sign check on the original vector using
4220/// reduce.{or,umax} or reduce.{and,umin}.
4221///
4222/// In spirit, it's similar to foldSignBitCheck in InstCombine.
///
/// Returns true when the compare was replaced; the fold is skipped when the
/// new reduction is costed higher than the old shift plus reduction.
4223bool VectorCombine::foldSignBitReductionCmp(Instruction &I) {
4224 CmpPredicate Pred;
4225 Value *ReduceOp;
4226 const APInt *CmpVal;
4227 if (!match(&I, m_ICmp(Pred, m_Value(ReduceOp), m_APInt(CmpVal))))
4228 return false;
4229
// The reduction must be an intrinsic call used only by this compare.
4230 auto *II = dyn_cast<IntrinsicInst>(ReduceOp);
4231 if (!II || !II->hasOneUse())
4232 return false;
4233
4234 Intrinsic::ID OrigIID = II->getIntrinsicID();
4235 switch (OrigIID) {
4236 case Intrinsic::vector_reduce_or:
4237 case Intrinsic::vector_reduce_umax:
4238 case Intrinsic::vector_reduce_and:
4239 case Intrinsic::vector_reduce_umin:
4240 case Intrinsic::vector_reduce_add:
4241 break;
4242 default:
4243 return false;
4244 }
4245
// The reduction's input must be used only by the reduction.
4246 Value *ReductionSrc = II->getArgOperand(0);
4247 if (!ReductionSrc->hasOneUse())
4248 return false;
4249
4250 auto *VecTy = dyn_cast<FixedVectorType>(ReductionSrc->getType());
4251 if (!VecTy)
4252 return false;
4253
4254 unsigned BitWidth = VecTy->getScalarSizeInBits();
4255 if (BitWidth == 1)
4256 return false;
4257
4258 unsigned NumElts = VecTy->getNumElements();
4259
4260 // For reduce.add, NumElts must fit as a signed integer for the range
4261 // calculations to be correct. Both lshr [0, N] and ashr [-N, 0] require
4262 // N to be representable as a positive signed value.
4263 if (OrigIID == Intrinsic::vector_reduce_add && !isIntN(BitWidth, NumElts))
4264 return false;
4265
4266 // Match sign-bit extraction: shr X, (bitwidth-1)
4267 Value *X;
4268 if (!match(ReductionSrc, m_Shr(m_Value(X), m_SpecificInt(BitWidth - 1))))
4269 return false;
4270
4271 // Compute the boundary value when all elements are negative:
4272 // - Per-element contribution: 1 for lshr, -1 for ashr
4273 // - For add: N * per-element; for others: just per-element
4274 bool IsAShr = isa<AShrOperator>(ReductionSrc);
4275 unsigned Count = (OrigIID == Intrinsic::vector_reduce_add) ? NumElts : 1;
4276 APInt NegativeVal(CmpVal->getBitWidth(), Count);
4277 if (IsAShr)
4278 NegativeVal.negate();
4279
4280 // Range is [min(0, AllNegVal), max(0, AllNegVal)]
4281 APInt Zero = APInt::getZero(CmpVal->getBitWidth());
4282 APInt RangeLow = APIntOps::smin(Zero, NegativeVal);
4283 APInt RangeHigh = APIntOps::smax(Zero, NegativeVal);
4284
4285 // Determine comparison semantics:
4286 // - IsEq: true for equality test, false for inequality
4287 // - TestsNegative: true if testing against AllNegVal, false for zero
4288 //
4289 // In addition to EQ/NE against 0 or AllNegVal, we support inequalities
4290 // that fold to boundary tests given the narrow value range:
4291 // < RangeHigh -> != RangeHigh
4292 // > RangeHigh-1 -> == RangeHigh
4293 // > RangeLow -> != RangeLow
4294 // < RangeLow+1 -> == RangeLow
4295 //
4296 // For inequalities, we work with signed predicates only. Unsigned predicates
4297 // are canonicalized to signed when the range is non-negative (where they are
4298 // equivalent). When the range includes negative values, unsigned predicates
4299 // would have different semantics due to wrap-around, so we reject them.
4300 if (!ICmpInst::isEquality(Pred) && !ICmpInst::isSigned(Pred)) {
4301 if (RangeLow.isNegative())
4302 return false;
4303 Pred = ICmpInst::getSignedPredicate(Pred);
4304 }
4305
// Both flags are assigned on every path that does not return false.
4306 bool IsEq;
4307 bool TestsNegative;
4308 if (ICmpInst::isEquality(Pred)) {
4309 if (CmpVal->isZero()) {
4310 TestsNegative = false;
4311 } else if (*CmpVal == NegativeVal) {
4312 TestsNegative = true;
4313 } else {
4314 return false;
4315 }
4316 IsEq = Pred == ICmpInst::ICMP_EQ;
4317 } else if (Pred == ICmpInst::ICMP_SLT && *CmpVal == RangeHigh) {
4318 IsEq = false;
4319 TestsNegative = (RangeHigh == NegativeVal);
4320 } else if (Pred == ICmpInst::ICMP_SGT && *CmpVal == RangeHigh - 1) {
4321 IsEq = true;
4322 TestsNegative = (RangeHigh == NegativeVal);
4323 } else if (Pred == ICmpInst::ICMP_SGT && *CmpVal == RangeLow) {
4324 IsEq = false;
4325 TestsNegative = (RangeLow == NegativeVal);
4326 } else if (Pred == ICmpInst::ICMP_SLT && *CmpVal == RangeLow + 1) {
4327 IsEq = true;
4328 TestsNegative = (RangeLow == NegativeVal);
4329 } else {
4330 return false;
4331 }
4332
4333 // For this fold we support four types of checks:
4334 //
4335 // 1. All lanes are negative - AllNeg
4336 // 2. All lanes are non-negative - AllNonNeg
4337 // 3. At least one negative lane - AnyNeg
4338 // 4. At least one non-negative lane - AnyNonNeg
4339 //
4340 // For each case, we can generate the following code:
4341 //
4342 // 1. AllNeg - reduce.and/umin(X) < 0
4343 // 2. AllNonNeg - reduce.or/umax(X) > -1
4344 // 3. AnyNeg - reduce.or/umax(X) < 0
4345 // 4. AnyNonNeg - reduce.and/umin(X) > -1
4346 //
4347 // The table below shows the aggregation of all supported cases
4348 // using these four cases.
4349 //
4350 // Reduction | == 0 | != 0 | == MAX | != MAX
4351 // ------------+-----------+-----------+-----------+-----------
4352 // or/umax | AllNonNeg | AnyNeg | AnyNeg | AllNonNeg
4353 // and/umin | AnyNonNeg | AllNeg | AllNeg | AnyNonNeg
4354 // add | AllNonNeg | AnyNeg | AllNeg | AnyNonNeg
4355 //
4356 // NOTE: MAX = 1 for or/and/umax/umin, and the vector size N for add
4357 //
4358 // For easier codegen and check inversion, we use the following encoding:
4359 //
4360 // 1. Bit-3 === requires or/umax (1) or and/umin (0) check
4361 // 2. Bit-2 === checks < 0 (1) or > -1 (0)
4362 // 3. Bit-1 === universal (1) or existential (0) check
4363 //
4364 // AnyNeg = 0b110: uses or/umax, checks negative, any-check
4365 // AllNonNeg = 0b101: uses or/umax, checks non-neg, all-check
4366 // AnyNonNeg = 0b000: uses and/umin, checks non-neg, any-check
4367 // AllNeg = 0b011: uses and/umin, checks negative, all-check
4368 //
4369 // XOR with 0b011 inverts the check (swaps all/any and neg/non-neg).
4370 //
4371 enum CheckKind : unsigned {
4372 AnyNonNeg = 0b000,
4373 AllNeg = 0b011,
4374 AllNonNeg = 0b101,
4375 AnyNeg = 0b110,
4376 };
4377 // Return true if we fold this check into or/umax and false for and/umin
4378 auto RequiresOr = [](CheckKind C) -> bool { return C & 0b100; };
4379 // Return true if we should check if result is negative and false otherwise
4380 auto IsNegativeCheck = [](CheckKind C) -> bool { return C & 0b010; };
4381 // Logically invert the check
4382 auto Invert = [](CheckKind C) { return CheckKind(C ^ 0b011); };
4383
// Pick the check that corresponds to an equality test (the "== 0" and
// "== MAX" columns of the table above); inequality is handled by inversion.
4384 CheckKind Base;
4385 switch (OrigIID) {
4386 case Intrinsic::vector_reduce_or:
4387 case Intrinsic::vector_reduce_umax:
4388 Base = TestsNegative ? AnyNeg : AllNonNeg;
4389 break;
4390 case Intrinsic::vector_reduce_and:
4391 case Intrinsic::vector_reduce_umin:
4392 Base = TestsNegative ? AllNeg : AnyNonNeg;
4393 break;
4394 case Intrinsic::vector_reduce_add:
4395 Base = TestsNegative ? AllNeg : AllNonNeg;
4396 break;
4397 default:
4398 llvm_unreachable("Unexpected intrinsic");
4399 }
4400
4401 CheckKind Check = IsEq ? Base : Invert(Base);
4402
4403 // Calculate old cost: shift + reduction
// NOTE(review): the initializer of OldCost is not visible in this listing.
4404 InstructionCost OldCost =
4406 OldCost += TTI.getInstructionCost(II, CostKind);
4407
// Returns the cheaper of the two candidate reductions together with its cost;
// the arithmetic form wins ties.
// NOTE(review): the heads of both cost calls are not visible in this listing.
4408 auto PickCheaper = [&](Intrinsic::ID Arith, Intrinsic::ID MinMax) {
4409 InstructionCost ArithCost =
4411 VecTy, std::nullopt, CostKind);
4412 InstructionCost MinMaxCost =
4414 FastMathFlags(), CostKind);
4415 return ArithCost <= MinMaxCost ? std::make_pair(Arith, ArithCost)
4416 : std::make_pair(MinMax, MinMaxCost);
4417 };
4418
4419 // Choose output reduction based on encoding's MSB
4420 auto [NewIID, NewCost] = RequiresOr(Check)
4421 ? PickCheaper(Intrinsic::vector_reduce_or,
4422 Intrinsic::vector_reduce_umax)
4423 : PickCheaper(Intrinsic::vector_reduce_and,
4424 Intrinsic::vector_reduce_umin);
4425
4426 LLVM_DEBUG(dbgs() << "Found sign-bit reduction cmp: " << I << "\n OldCost: "
4427 << OldCost << " vs NewCost: " << NewCost << "\n");
4428
// Equal cost is accepted; only a strictly more expensive result is rejected.
4429 if (NewCost > OldCost)
4430 return false;
4431
4432 // Generate comparison based on encoding's neg bit: slt 0 for neg, sgt -1 for
4433 // non-neg
4434 Builder.SetInsertPoint(&I);
4435 Type *ScalarTy = VecTy->getScalarType();
4436 Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, NewIID, {X});
4437 Value *NewCmp = IsNegativeCheck(Check) ? Builder.CreateIsNeg(NewReduce)
4438 : Builder.CreateIsNotNeg(NewReduce);
4439 replaceValue(I, *NewCmp);
4440 return true;
4441}
4442
4443/// vector.reduce.OP f(X_i) == 0 -> vector.reduce.OP X_i == 0
4444///
4445/// We can prove it for cases when:
4446///
4447/// 1. OP X_i == 0 <=> \forall i \in [1, N] X_i == 0
4448/// 1'. OP X_i == 0 <=> \exists j \in [1, N] X_j == 0
4449/// 2. f(x) == 0 <=> x == 0
4450///
4451/// From 1 and 2 (or 1' and 2), we can infer that
4452///
4453/// OP f(X_i) == 0 <=> OP X_i == 0.
4454///
4455/// (1)
4456/// OP f(X_i) == 0 <=> \forall i \in [1, N] f(X_i) == 0
4457/// (2)
4458/// <=> \forall i \in [1, N] X_i == 0
4459/// (1)
4460/// <=> OP(X_i) == 0
4461///
4462/// For some of the OP's and f's, we need to have domain constraints on X
4463/// to ensure properties 1 (or 1') and 2.
///
/// Both `eq` and `ne` comparisons against zero are accepted; the predicate is
/// carried over unchanged to the new compare.
///
/// Returns true if \p I was replaced by a compare of the same reduction applied
/// directly to X, and false if the pattern, the domain constraints or the cost
/// check did not hold.
4464bool VectorCombine::foldICmpEqZeroVectorReduce(Instruction &I) {
4465 CmpPredicate Pred;
4466 Value *Op;
     // Only `icmp eq/ne Op, 0` is of interest.
4467 if (!match(&I, m_ICmp(Pred, m_Value(Op), m_Zero())) ||
4468 !ICmpInst::isEquality(Pred))
4469 return false;
4470
     // The compared value has to be an intrinsic call ...
4471 auto *II = dyn_cast<IntrinsicInst>(Op);
4472 if (!II)
4473 return false;
4474
     // ... and one of the six reductions listed here; every other intrinsic
     // (including reduce.and / reduce.xor / reduce.mul) is rejected.
4475 switch (II->getIntrinsicID()) {
4476 case Intrinsic::vector_reduce_add:
4477 case Intrinsic::vector_reduce_or:
4478 case Intrinsic::vector_reduce_umin:
4479 case Intrinsic::vector_reduce_umax:
4480 case Intrinsic::vector_reduce_smin:
4481 case Intrinsic::vector_reduce_smax:
4482 break;
4483 default:
4484 return false;
4485 }
4486
     // InnerOp is the candidate f(X) feeding the reduction.
4487 Value *InnerOp = II->getArgOperand(0);
4488
4489 // TODO: fixed vector type might be too restrictive
     // The reduction must have a single use, and its input must be a fixed-width
     // vector.
4490 if (!II->hasOneUse() || !isa<FixedVectorType>(InnerOp->getType()))
4491 return false;
4492
4493 Value *X = nullptr;
4494
4495 // Check for zero-preserving operations where f(x) = 0 <=> x = 0
4496 //
4497 // 1. f(x) = shl nuw x, y for arbitrary y
4498 // 2. f(x) = mul nuw x, c for defined c != 0
4499 // 3. f(x) = zext x
4500 // 4. f(x) = sext x
4501 // 5. f(x) = neg x
4502 //
4503 if (!(match(InnerOp, m_NUWShl(m_Value(X), m_Value())) || // Case 1
4504 match(InnerOp, m_NUWMul(m_Value(X), m_NonZeroInt())) || // Case 2
4505 match(InnerOp, m_ZExt(m_Value(X))) || // Case 3
4506 match(InnerOp, m_SExt(m_Value(X))) || // Case 4
4507 match(InnerOp, m_Neg(m_Value(X))) // Case 5
4508 ))
4509 return false;
4510
     // The known-bits / sign queries below use a SimplifyQuery positioned at I.
4511 SimplifyQuery S = SQ.getWithInstruction(&I);
4512 auto *XTy = cast<FixedVectorType>(X->getType());
4513
4514 // Check for domain constraints for all supported reductions.
4515 //
4516 // a. OR X_i - has property 1 for every X
4517 // b. UMAX X_i - has property 1 for every X
4518 // c. UMIN X_i - has property 1' for every X
4519 // d. SMAX X_i - has property 1 for X >= 0
4520 // e. SMIN X_i - has property 1' for X >= 0
4521 // f. ADD X_i - has property 1 for X >= 0 && ADD X_i doesn't sign wrap
4522 //
4523 // In order for the proof to work, we need 1 (or 1') to be true for both
4524 // OP f(X_i) and OP X_i and that's why below we check constraints twice.
4525 //
4526 // NOTE: ADD X_i holds property 1 for a mirror case as well, i.e. when
4527 // X <= 0 && ADD X_i doesn't sign wrap. However, due to the nature
4528 // of known bits, we can't reasonably hold knowledge of "either 0
4529 // or negative".
4530 switch (II->getIntrinsicID()) {
4531 case Intrinsic::vector_reduce_add: {
4532 // We need to check that both X_i and f(X_i) have enough leading
4533 // zeros to not overflow.
4534 KnownBits KnownX = computeKnownBits(X, S);
4535 KnownBits KnownFX = computeKnownBits(InnerOp, S);
4536 unsigned NumElems = XTy->getNumElements();
4537 // Adding N elements loses at most ceil(log2(N)) leading bits.
4538 unsigned LostBits = Log2_32_Ceil(NumElems);
4539 unsigned LeadingZerosX = KnownX.countMinLeadingZeros();
4540 unsigned LeadingZerosFX = KnownFX.countMinLeadingZeros();
4541 // Need at least one leading zero left after summation to ensure no overflow
4542 if (LeadingZerosX <= LostBits || LeadingZerosFX <= LostBits)
4543 return false;
4544
4545 // We are not checking whether X or f(X) are positive explicitly because
4546 // we implicitly checked for it when we checked if both cases have enough
4547 // leading zeros to not wrap addition.
4548 break;
4549 }
4550 case Intrinsic::vector_reduce_smin:
4551 case Intrinsic::vector_reduce_smax:
4552 // Check whether X >= 0 and f(X) >= 0
4553 if (!isKnownNonNegative(InnerOp, S) || !isKnownNonNegative(X, S))
4554 return false;
4555
4556 break;
     // or / umin / umax (cases a-c above) need no extra constraint.
4557 default:
4558 break;
4559 };
4560
4561 LLVM_DEBUG(dbgs() << "Found a reduction to 0 comparison with removable op: "
4562 << *II << "\n");
4563
4564 // For zext/sext, check if the transform is profitable using cost model.
4565 // For other operations (shl, mul, neg), we're removing an instruction
4566 // while keeping the same reduction type, so it's always profitable.
4567 if (isa<ZExtInst>(InnerOp) || isa<SExtInst>(InnerOp)) {
4568 auto *FXTy = cast<FixedVectorType>(InnerOp->getType());
4569 Intrinsic::ID IID = II->getIntrinsicID();
4570
     // NOTE(review): the embedded listing numbers jump 4570 -> 4572 -> 4574, so
     // the statement that declares ExtCost (used further down) is only partly
     // present in this copy. Presumably it is the cast cost of the extension
     // from XTy to FXTy -- restore from the upstream file and confirm.
4572 cast<CastInst>(InnerOp)->getOpcode(), FXTy, XTy,
4574
     // Cost the same reduction on the wide type (FXTy, before the fold) and on
     // the narrow type (XTy, after the fold), using the arithmetic or min/max
     // reduction cost query as appropriate for the intrinsic.
4575 InstructionCost OldReduceCost, NewReduceCost;
4576 switch (IID) {
4577 case Intrinsic::vector_reduce_add:
4578 case Intrinsic::vector_reduce_or:
4579 OldReduceCost = TTI.getArithmeticReductionCost(
4580 getArithmeticReductionInstruction(IID), FXTy, std::nullopt, CostKind);
4581 NewReduceCost = TTI.getArithmeticReductionCost(
4582 getArithmeticReductionInstruction(IID), XTy, std::nullopt, CostKind);
4583 break;
4584 case Intrinsic::vector_reduce_umin:
4585 case Intrinsic::vector_reduce_umax:
4586 case Intrinsic::vector_reduce_smin:
4587 case Intrinsic::vector_reduce_smax:
4588 OldReduceCost = TTI.getMinMaxReductionCost(
4589 getMinMaxReductionIntrinsicOp(IID), FXTy, FastMathFlags(), CostKind);
4590 NewReduceCost = TTI.getMinMaxReductionCost(
4591 getMinMaxReductionIntrinsicOp(IID), XTy, FastMathFlags(), CostKind);
4592 break;
4593 default:
4594 llvm_unreachable("Unexpected reduction");
4595 }
4596
     // If the extension has users other than the reduction it survives the
     // fold, so its cost is charged to the new sequence as well.
4597 InstructionCost OldCost = OldReduceCost + ExtCost;
4598 InstructionCost NewCost =
4599 NewReduceCost + (InnerOp->hasOneUse() ? 0 : ExtCost);
4600
4601 LLVM_DEBUG(dbgs() << "Found a removable extension before reduction: "
4602 << *InnerOp << "\n OldCost: " << OldCost
4603 << " vs NewCost: " << NewCost << "\n");
4604
4605 // We consider transformation to still be potentially beneficial even
4606 // when the costs are the same because we might remove a use from f(X)
4607 // and unlock other optimizations. Equal costs would just mean that we
4608 // didn't make it worse in the worst case.
4609 if (NewCost > OldCost)
4610 return false;
4611 }
4612
4613 // Since we support zext and sext as f, we might change the scalar type
4614 // of the intrinsic.
     // NOTE(review): unlike foldEquivalentReductionCmp, this function never calls
     // Builder.SetInsertPoint(&I); presumably the caller positions the builder
     // at I before invoking the fold -- confirm.
4615 Type *Ty = XTy->getScalarType();
4616 Value *NewReduce = Builder.CreateIntrinsic(Ty, II->getIntrinsicID(), {X});
4617 Value *NewCmp =
4618 Builder.CreateICmp(Pred, NewReduce, ConstantInt::getNullValue(Ty));
4619 replaceValue(I, *NewCmp);
4620 return true;
4621}
4622
4623/// Fold comparisons of reduce.or/reduce.and with reduce.umax/reduce.umin
4624/// based on cost, preserving the comparison semantics.
4625///
4626/// We use three fundamental properties for each pair:
4627///
4628/// 1. or(X) == 0 <=> umax(X) == 0
4629/// 2. or(X) == 1 <=> umax(X) == 1
4630/// 3. sign(or(X)) == sign(umax(X))
4631///
4632/// 1. and(X) == -1 <=> umin(X) == -1
4633/// 2. and(X) == -2 <=> umin(X) == -2
4634/// 3. sign(and(X)) == sign(umin(X))
4635///
4636/// From these we can infer the following transformations:
4637/// a. or(X) ==/!= 0 <-> umax(X) ==/!= 0
4638/// b. or(X) s< 0 <-> umax(X) s< 0
4639/// c. or(X) s> -1 <-> umax(X) s> -1
4640/// d. or(X) s< 1 <-> umax(X) s< 1
4641/// e. or(X) ==/!= 1 <-> umax(X) ==/!= 1
4642/// f. or(X) s< 2 <-> umax(X) s< 2
4643/// g. and(X) ==/!= -1 <-> umin(X) ==/!= -1
4644/// h. and(X) s< 0 <-> umin(X) s< 0
4645/// i. and(X) s> -1 <-> umin(X) s> -1
4646/// j. and(X) s> -2 <-> umin(X) s> -2
4647/// k. and(X) ==/!= -2 <-> umin(X) ==/!= -2
4648/// l. and(X) s> -3 <-> umin(X) s> -3
4649///
/// The swap works in either direction (or <-> umax, and <-> umin) and is only
/// performed when the alternative reduction is strictly cheaper according to
/// the target cost model.
///
/// Returns true if \p I was replaced by a compare of the alternative reduction.
4650bool VectorCombine::foldEquivalentReductionCmp(Instruction &I) {
4651 CmpPredicate Pred;
4652 Value *ReduceOp;
4653 const APInt *CmpVal;
     // Match `icmp Pred ReduceOp, CmpVal`, where the right-hand side is matched
     // by m_APInt.
4654 if (!match(&I, m_ICmp(Pred, m_Value(ReduceOp), m_APInt(CmpVal))))
4655 return false;
4656
     // The compared value must be an intrinsic call with a single use.
4657 auto *II = dyn_cast<IntrinsicInst>(ReduceOp);
4658 if (!II || !II->hasOneUse())
4659 return false;
4660
     // True if (Pred, CmpVal) is one of the or/umax cases a-f from the header.
4661 const auto IsValidOrUmaxCmp = [&]() {
4662 // or === umax for i1
4663 if (CmpVal->getBitWidth() == 1)
4664 return true;
4665
4666 // Cases a and e
4667 bool IsEquality =
4668 (CmpVal->isZero() || CmpVal->isOne()) && ICmpInst::isEquality(Pred);
4669 // Case c
4670 bool IsPositive = CmpVal->isAllOnes() && Pred == ICmpInst::ICMP_SGT;
4671 // Cases b, d, and f
4672 bool IsNegative = (CmpVal->isZero() || CmpVal->isOne() || *CmpVal == 2) &&
4673 Pred == ICmpInst::ICMP_SLT;
4674 return IsEquality || IsPositive || IsNegative;
4675 };
4676
     // True if (Pred, CmpVal) is one of the and/umin cases g-l from the header.
4677 const auto IsValidAndUminCmp = [&]() {
4678 // and === umin for i1
4679 if (CmpVal->getBitWidth() == 1)
4680 return true;
4681
4682 const auto LeadingOnes = CmpVal->countl_one();
4683
4684 // Cases g and k
     // (`LeadingOnes + 1 == BitWidth` selects the constant -2.)
     // NOTE(review): the embedded listing numbers jump 4686 -> 4688, so the tail
     // of this expression is missing from this copy; by analogy with
     // IsValidOrUmaxCmp it is presumably the equality-predicate test -- restore
     // from the upstream file and confirm.
4685 bool IsEquality =
4686 (CmpVal->isAllOnes() || LeadingOnes + 1 == CmpVal->getBitWidth()) &&
4688 // Case h
4689 bool IsNegative = CmpVal->isZero() && Pred == ICmpInst::ICMP_SLT;
4690 // Cases i, j, and l
4691 bool IsPositive =
4692 // if the number has at least N - 2 leading ones
4693 // and the two LSBs are:
4694 // - 1 x 1 -> -1
4695 // - 1 x 0 -> -2
4696 // - 0 x 1 -> -3
4697 LeadingOnes + 2 >= CmpVal->getBitWidth() &&
4698 ((*CmpVal)[0] || (*CmpVal)[1]) && Pred == ICmpInst::ICMP_SGT;
4699 return IsEquality || IsNegative || IsPositive;
4700 };
4701
4702 Intrinsic::ID OriginalIID = II->getIntrinsicID();
4703 Intrinsic::ID AlternativeIID;
4704
4705 // Check if this is a valid comparison pattern and determine the alternate
4706 // reduction intrinsic.
4707 switch (OriginalIID) {
4708 case Intrinsic::vector_reduce_or:
4709 if (!IsValidOrUmaxCmp())
4710 return false;
4711 AlternativeIID = Intrinsic::vector_reduce_umax;
4712 break;
4713 case Intrinsic::vector_reduce_umax:
4714 if (!IsValidOrUmaxCmp())
4715 return false;
4716 AlternativeIID = Intrinsic::vector_reduce_or;
4717 break;
4718 case Intrinsic::vector_reduce_and:
4719 if (!IsValidAndUminCmp())
4720 return false;
4721 AlternativeIID = Intrinsic::vector_reduce_umin;
4722 break;
4723 case Intrinsic::vector_reduce_umin:
4724 if (!IsValidAndUminCmp())
4725 return false;
4726 AlternativeIID = Intrinsic::vector_reduce_and;
4727 break;
4728 default:
4729 return false;
4730 }
4731
     // Only fixed-width vector inputs are handled.
4732 Value *X = II->getArgOperand(0);
4733 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
4734 if (!VecTy)
4735 return false;
4736
     // Cost of reducing VecTy with the given intrinsic. Anything whose reduction
     // opcode is not ICmp is costed as an arithmetic reduction.
     // NOTE(review): the embedded listing numbers jump 4741 -> 4743, so the head
     // of the fall-through return is missing from this copy; given the
     // FastMathFlags() argument it is presumably the min/max reduction cost
     // query used for umin/umax -- restore from the upstream file and confirm.
4737 const auto GetReductionCost = [&](Intrinsic::ID IID) -> InstructionCost {
4738 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4739 if (ReductionOpc != Instruction::ICmp)
4740 return TTI.getArithmeticReductionCost(ReductionOpc, VecTy, std::nullopt,
4741 CostKind);
4743 FastMathFlags(), CostKind);
4744 };
4745
4746 InstructionCost OrigCost = GetReductionCost(OriginalIID);
4747 InstructionCost AltCost = GetReductionCost(AlternativeIID);
4748
4749 LLVM_DEBUG(dbgs() << "Found equivalent reduction cmp: " << I
4750 << "\n OrigCost: " << OrigCost
4751 << " vs AltCost: " << AltCost << "\n");
4752
     // Require a strict improvement: on a tie the original reduction is kept.
4753 if (AltCost >= OrigCost)
4754 return false;
4755
     // Emit the alternative reduction on the same input, followed by the same
     // predicate and constant.
4756 Builder.SetInsertPoint(&I);
4757 Type *ScalarTy = VecTy->getScalarType();
4758 Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, AlternativeIID, {X});
4759 Value *NewCmp =
4760 Builder.CreateICmp(Pred, NewReduce, ConstantInt::get(ScalarTy, *CmpVal));
4761
4762 replaceValue(I, *NewCmp);
4763 return true;
4764}
4765
4766/// Returns true if this ShuffleVectorInst eventually feeds into a
4767/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
4768/// chains of shuffles and binary operators (in any combination/order).
4769/// The search is bounded by the number of distinct users visited, not by depth:
/// once more than MaxVisited users have been seen the function returns false.
/// It also returns false when a second intrinsic user is reached after a
/// reduction was already found, or when an intrinsic user is not one of the
/// integer reductions listed in the switch below.
///
/// NOTE(review): the embedded listing numbers jump 4769 -> 4771, 4771 -> 4774
/// and 4805 -> 4807, so the function signature, the declarations of Visited and
/// WorkList, and the condition guarding the `return false` before the worklist
/// push are missing from this copy -- restore them from the upstream file.
4771 constexpr unsigned MaxVisited = 32;
4774 bool FoundReduction = false;
4775
     // Walk the transitive users of SVI using an explicit worklist.
4776 WorkList.push_back(SVI);
4777 while (!WorkList.empty()) {
4778 Instruction *I = WorkList.pop_back_val();
4779 for (User *U : I->users()) {
     // NOTE(review): cast<> is not expected to yield null for a non-null User, so
     // the `!UI` half of the test below looks redundant -- confirm, then either
     // drop it or switch to dyn_cast if non-instruction users are possible.
4780 auto *UI = cast<Instruction>(U);
     // Skip users that have already been visited.
4781 if (!UI || !Visited.insert(UI).second)
4782 continue;
4783 if (Visited.size() > MaxVisited)
4784 return false;
4785 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
4786 // More than one reduction reached
4787 if (FoundReduction)
4788 return false;
4789 switch (II->getIntrinsicID()) {
4790 case Intrinsic::vector_reduce_add:
4791 case Intrinsic::vector_reduce_mul:
4792 case Intrinsic::vector_reduce_and:
4793 case Intrinsic::vector_reduce_or:
4794 case Intrinsic::vector_reduce_xor:
4795 case Intrinsic::vector_reduce_smin:
4796 case Intrinsic::vector_reduce_smax:
4797 case Intrinsic::vector_reduce_umin:
4798 case Intrinsic::vector_reduce_umax:
     // A reduction ends this chain: record it and do not follow its users.
4799 FoundReduction = true;
4800 continue;
4801 default:
4802 return false;
4803 }
4804 }
4805
4807 return false;
4808
     // Otherwise keep following this user's own users.
4809 WorkList.emplace_back(UI);
4810 }
4811 }
4812 return FoundReduction;
4813}
4814
4815/// This method looks for groups of shuffles acting on binops, of the form:
4816/// %x = shuffle ...
4817/// %y = shuffle ...
4818/// %a = binop %x, %y
4819/// %b = binop %x, %y
4820/// shuffle %a, %b, selectmask
4821/// We may, especially if the shuffle is wider than legal, be able to convert
4822/// the shuffle to a form where only parts of a and b need to be computed. On
4823/// architectures with no obvious "select" shuffle, this can reduce the total
4824/// number of operations if the target reports them as cheaper.
///
/// \p I must be a shufflevector producing a fixed-width vector (both are
/// cast<> unconditionally below). When \p FromReduction is true only a single
/// output shuffle is accepted and its reconstruction mask is sorted, since the
/// lane order of the result does not matter to a reduction.
///
/// Returns true if the binops and shuffles were rewritten.
///
/// NOTE(review): this copy of the listing has dropped several source lines (the
/// embedded numbers skip 4856, 4894, 4918, 5035-5036, 5039-5040, 5048-5049,
/// 5054, 5059 and 5074). The gaps are marked below; restore them from the
/// upstream file before building.
4825bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
4826 auto *SVI = cast<ShuffleVectorInst>(&I);
4827 auto *VT = cast<FixedVectorType>(I.getType());
4828 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
4829 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
     // Both shuffle operands must be distinct binary operators of the result type.
4830 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
4831 VT != Op0->getType())
4832 return false;
4833
     // The (up to four) instructions feeding the two binops.
4834 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
4835 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
4836 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
4837 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
4838 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
     // Returns true (i.e. "reject") when the input is not an instruction, when
     // its first operand is not of type VT, or when it has a user other than
     // Op0, Op1, one of the input shuffles, or a trivially dead shuffle.
4839 auto checkSVNonOpUses = [&](Instruction *I) {
4840 if (!I || I->getOperand(0)->getType() != VT)
4841 return true;
4842 return any_of(I->users(), [&](User *U) {
4843 return U != Op0 && U != Op1 &&
4844 !(isa<ShuffleVectorInst>(U) &&
4845 (InputShuffles.contains(cast<Instruction>(U)) ||
4846 isInstructionTriviallyDead(cast<Instruction>(U))));
4847 });
4848 };
4849 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
4850 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
4851 return false;
4852
4853 // Collect all the uses that are shuffles that we can transform together. We
4854 // may not have a single shuffle, but a group that can all be transformed
4855 // together profitably.
     // NOTE(review): listing line 4856 (presumably the declaration of Shuffles)
     // is missing from this copy.
     // collectShuffles fails unless every user of the binop is a shufflevector of
     // type VT whose two operands are both taken from {Op0, Op1}.
4857 auto collectShuffles = [&](Instruction *I) {
4858 for (auto *U : I->users()) {
4859 auto *SV = dyn_cast<ShuffleVectorInst>(U);
4860 if (!SV || SV->getType() != VT)
4861 return false;
4862 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
4863 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
4864 return false;
4865 if (!llvm::is_contained(Shuffles, SV))
4866 Shuffles.push_back(SV);
4867 }
4868 return true;
4869 };
4870 if (!collectShuffles(Op0) || !collectShuffles(Op1))
4871 return false;
4872 // From a reduction, we need to be processing a single shuffle, otherwise the
4873 // other uses will not be lane-invariant.
4874 if (FromReduction && Shuffles.size() > 1)
4875 return false;
4876
4877 // Add any shuffle uses for the shuffles we have found, to include them in our
4878 // cost calculations.
     // NOTE(review): Shuffles is appended to while it is being range-iterated; if
     // it is a vector-like container a reallocation would invalidate the loop's
     // iterators -- confirm against the (missing) declaration and consider
     // iterating by index instead.
4879 if (!FromReduction) {
4880 for (ShuffleVectorInst *SV : Shuffles) {
4881 for (auto *U : SV->users()) {
4882 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
4883 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
4884 Shuffles.push_back(SSV);
4885 }
4886 }
4887 }
4888
4889 // For each of the output shuffles, we try to sort all the first vector
4890 // elements to the beginning, followed by the second array elements at the
4891 // end. If the binops are legalized to smaller vectors, this may reduce total
4892 // number of binops. We compute the ReconstructMask mask needed to convert
4893 // back to the original lane order.
     // NOTE(review): listing line 4894 (presumably the declaration of V1 and V2,
     // which are used below as sequences of std::pair<int, int>) is missing from
     // this copy.
4895 SmallVector<SmallVector<int>> OrigReconstructMasks;
4896 int MaxV1Elt = 0, MaxV2Elt = 0;
4897 unsigned NumElts = VT->getNumElements();
4898 for (ShuffleVectorInst *SVN : Shuffles) {
4899 SmallVector<int> Mask;
4900 SVN->getShuffleMask(Mask);
4901
4902 // Check the operands are the same as the original, or reversed (in which
4903 // case we need to commute the mask).
4904 Value *SVOp0 = SVN->getOperand(0);
4905 Value *SVOp1 = SVN->getOperand(1);
     // A single-source shuffle (second operand undef) is looked through: its
     // mask is composed with the mask of the shuffle it reads from.
4906 if (isa<UndefValue>(SVOp1)) {
4907 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
4908 SVOp0 = SSV->getOperand(0);
4909 SVOp1 = SSV->getOperand(1);
4910 for (int &Elem : Mask) {
4911 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
4912 return false;
4913 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
4914 }
4915 }
     // NOTE(review): listing line 4918 is missing from this copy; per the comment
     // above it presumably commutes Mask to match the swapped operands.
4916 if (SVOp0 == Op1 && SVOp1 == Op0) {
4917 std::swap(SVOp0, SVOp1);
4919 }
4920 if (SVOp0 != Op0 || SVOp1 != Op1)
4921 return false;
4922
4923 // Calculate the reconstruction mask for this shuffle, as the mask needed to
4924 // take the packed values from Op0/Op1 and reconstructing to the original
4925 // order.
     // V1 / V2 record, in first-seen order, which lanes of Op0 / Op1 are used:
     // `first` is the original lane, `second` the packed index assigned to it
     // (offset by NumElts for V2).
4926 SmallVector<int> ReconstructMask;
4927 for (unsigned I = 0; I < Mask.size(); I++) {
4928 if (Mask[I] < 0) {
4929 ReconstructMask.push_back(-1);
4930 } else if (Mask[I] < static_cast<int>(NumElts)) {
4931 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
4932 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
4933 return Mask[I] == A.first;
4934 });
4935 if (It != V1.end())
4936 ReconstructMask.push_back(It - V1.begin());
4937 else {
4938 ReconstructMask.push_back(V1.size());
4939 V1.emplace_back(Mask[I], V1.size());
4940 }
4941 } else {
4942 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
4943 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
4944 return Mask[I] - static_cast<int>(NumElts) == A.first;
4945 });
4946 if (It != V2.end())
4947 ReconstructMask.push_back(NumElts + It - V2.begin());
4948 else {
4949 ReconstructMask.push_back(NumElts + V2.size());
4950 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
4951 }
4952 }
4953 }
4954
4955 // For reductions, we know that the lane ordering out doesn't alter the
4956 // result. In-order can help simplify the shuffle away.
4957 if (FromReduction)
4958 sort(ReconstructMask);
4959 OrigReconstructMasks.push_back(std::move(ReconstructMask));
4960 }
4961
4962 // If the Maximum element used from V1 and V2 are not larger than the new
4963 // vectors, the vectors are already packed and performing the optimization
4964 // again will likely not help any further. This also prevents us from getting
4965 // stuck in a cycle in case the costs do not also rule it out.
4966 if (V1.empty() || V2.empty() ||
4967 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
4968 MaxV2Elt == static_cast<int>(V2.size()) - 1))
4969 return false;
4970
4971 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
4972 // shuffle of another shuffle, or not a shuffle (that is treated like an
4973 // identity shuffle).
4974 auto GetBaseMaskValue = [&](Instruction *I, int M) {
4975 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4976 if (!SV)
4977 return M;
4978 if (isa<UndefValue>(SV->getOperand(1)))
4979 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4980 if (InputShuffles.contains(SSV))
4981 return SSV->getMaskValue(SV->getMaskValue(M));
4982 return SV->getMaskValue(M);
4983 };
4984
4985 // Attempt to sort the inputs by ascending mask values to make simpler input
4986 // shuffles and push complex shuffles down to the uses. We sort on the first
4987 // of the two input shuffle orders, to try and get at least one input into a
4988 // nice order.
4989 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
4990 std::pair<int, int> Y) {
4991 int MXA = GetBaseMaskValue(A, X.first);
4992 int MYA = GetBaseMaskValue(A, Y.first);
4993 return MXA < MYA;
4994 };
4995 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
4996 return SortBase(SVI0A, A, B);
4997 });
4998 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
4999 return SortBase(SVI1A, A, B);
5000 });
5001 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
5002 // modified order of the input shuffles.
5003 SmallVector<SmallVector<int>> ReconstructMasks;
5004 for (const auto &Mask : OrigReconstructMasks) {
5005 SmallVector<int> ReconstructMask;
5006 for (int M : Mask) {
     // Map a packed index recorded before sorting to its position after sorting.
5007 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
5008 auto It = find_if(V, [M](auto A) { return A.second == M; });
5009 assert(It != V.end() && "Expected all entries in Mask");
5010 return std::distance(V.begin(), It);
5011 };
5012 if (M < 0)
5013 ReconstructMask.push_back(-1);
5014 else if (M < static_cast<int>(NumElts)) {
5015 ReconstructMask.push_back(FindIndex(V1, M));
5016 } else {
5017 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
5018 }
5019 }
5020 ReconstructMasks.push_back(std::move(ReconstructMask));
5021 }
5022
5023 // Calculate the masks needed for the new input shuffles, which get padded
5024 // with undef
5025 SmallVector<int> V1A, V1B, V2A, V2B;
5026 for (unsigned I = 0; I < V1.size(); I++) {
5027 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
5028 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
5029 }
5030 for (unsigned I = 0; I < V2.size(); I++) {
5031 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
5032 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
5033 }
     // NOTE(review): the bodies of the two padding loops (listing lines 5035-5036
     // and 5039-5040) are missing from this copy; as shown the loops would never
     // terminate. Presumably each appends a padding mask element to the A and B
     // masks -- restore from the upstream file.
5034 while (V1A.size() < NumElts) {
5037 }
5038 while (V2A.size() < NumElts) {
5041 }
5042
     // Accumulator helpers for std::accumulate: add the cost of an existing
     // shuffle instruction (non-shuffles cost nothing) or of a bare mask.
     // NOTE(review): listing lines 5048-5049 and 5054 are missing from this copy,
     // so the shuffle-kind arguments and the second cost call are incomplete.
5043 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
5044 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5045 if (!SV)
5046 return C;
5047 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
5050 VT, VT, SV->getShuffleMask(), CostKind);
5051 };
5052 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
5053 return C +
5055 };
5056
     // NOTE(review): listing line 5059 (the initializer of MaxVectorSize) is
     // missing from this copy.
5057 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
5058 unsigned MaxVectorSize =
5060 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
5061 if (MaxElementsInVector == 0)
5062 return false;
5063 // When there are multiple shufflevector operations on the same input,
5064 // especially when the vector length is larger than the register size,
5065 // identical shuffle patterns may occur across different groups of elements.
5066 // To avoid overestimating the cost by counting these repeated shuffles more
5067 // than once, we only account for unique shuffle patterns. This adjustment
5068 // prevents inflated costs in the cost model for wide vectors split into
5069 // several register-sized groups.
5070 std::set<SmallVector<int, 4>> UniqueShuffles;
5071 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
5072 // Compute the cost for performing the shuffle over the full vector.
     // NOTE(review): listing line 5074 (the initializer of ShuffleCost) is missing
     // from this copy.
5073 auto ShuffleCost =
5075 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
5076 if (NumFullVectors < 2)
5077 return C + ShuffleCost;
5078 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
5079 unsigned NumUniqueGroups = 0;
     // NOTE(review): NumGroups is computed by the same expression as
     // NumFullVectors above.
5080 unsigned NumGroups = Mask.size() / MaxElementsInVector;
5081 // For each group of MaxElementsInVector contiguous elements,
5082 // collect their shuffle pattern and insert into the set of unique patterns.
5083 for (unsigned I = 0; I < NumFullVectors; ++I) {
5084 for (unsigned J = 0; J < MaxElementsInVector; ++J)
5085 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
5086 if (UniqueShuffles.insert(SubShuffle).second)
5087 NumUniqueGroups += 1;
5088 }
     // Scale the full-vector cost by the fraction of groups not seen before.
5089 return C + ShuffleCost * NumUniqueGroups / NumGroups;
5090 };
5091 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
5092 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5093 if (!SV)
5094 return C;
5095 SmallVector<int, 16> Mask;
5096 SV->getShuffleMask(Mask);
5097 return AddShuffleMaskAdjustedCost(C, Mask);
5098 };
5099 // Check that input consists of ShuffleVectors applied to the same input
5100 auto AllShufflesHaveSameOperands =
5101 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
5102 if (InputShuffles.size() < 2)
5103 return false;
5104 ShuffleVectorInst *FirstSV =
5105 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
5106 if (!FirstSV)
5107 return false;
5108
5109 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
5110 return std::all_of(
5111 std::next(InputShuffles.begin()), InputShuffles.end(),
5112 [&](Instruction *I) {
5113 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
5114 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
5115 });
5116 };
5117
5118 // Get the costs of the shuffles + binops before and after with the new
5119 // shuffle masks.
5120 InstructionCost CostBefore =
5121 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
5122 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
5123 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
5124 InstructionCost(0), AddShuffleCost);
     // The de-duplicating cost is used for the input shuffles only when they all
     // read the same pair of operands.
5125 if (AllShufflesHaveSameOperands(InputShuffles)) {
5126 UniqueShuffles.clear();
5127 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
5128 InstructionCost(0), AddShuffleAdjustedCost);
5129 } else {
5130 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
5131 InstructionCost(0), AddShuffleCost);
5132 }
5133
5134 // The new binops will be unused for lanes past the used shuffle lengths.
5135 // These types attempt to get the correct cost for that from the target.
5136 FixedVectorType *Op0SmallVT =
5137 FixedVectorType::get(VT->getScalarType(), V1.size());
5138 FixedVectorType *Op1SmallVT =
5139 FixedVectorType::get(VT->getScalarType(), V2.size());
5140 InstructionCost CostAfter =
5141 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
5142 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
5143 UniqueShuffles.clear();
5144 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
5145 InstructionCost(0), AddShuffleMaskAdjustedCost);
     // Identical new input masks are costed once (std::set de-duplicates them).
5146 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
5147 CostAfter +=
5148 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
5149 InstructionCost(0), AddShuffleMaskCost);
5150
5151 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
5152 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
5153 << " vs CostAfter: " << CostAfter << "\n");
     // Bail out if the rewrite is more expensive; on a cost tie proceed only when
     // the shuffle feeds a vector reduction.
5154 if (CostBefore < CostAfter ||
5155 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
5156 return false;
5157
5158 // The cost model has passed, create the new instructions.
     // Mirrors GetBaseMaskValue: picks the operand of the underlying shuffle,
     // looking through a single-source shuffle of an input shuffle.
5159 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
5160 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5161 if (!SV)
5162 return I;
5163 if (isa<UndefValue>(SV->getOperand(1)))
5164 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
5165 if (InputShuffles.contains(SSV))
5166 return SSV->getOperand(Op);
5167 return SV->getOperand(Op);
5168 };
     // Each new input shuffle is created right after the definition it replaces.
5169 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
5170 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
5171 GetShuffleOperand(SVI0A, 1), V1A);
5172 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
5173 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
5174 GetShuffleOperand(SVI0B, 1), V1B);
5175 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
5176 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
5177 GetShuffleOperand(SVI1A, 1), V2A);
5178 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
5179 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
5180 GetShuffleOperand(SVI1B, 1), V2B);
     // Recreate the two binops on the packed inputs, copying IR flags from the
     // originals when the builder returned an instruction.
5181 Builder.SetInsertPoint(Op0);
5182 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
5183 NSV0A, NSV0B);
5184 if (auto *I = dyn_cast<Instruction>(NOp0))
5185 I->copyIRFlags(Op0, true);
5186 Builder.SetInsertPoint(Op1);
5187 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
5188 NSV1A, NSV1B);
5189 if (auto *I = dyn_cast<Instruction>(NOp1))
5190 I->copyIRFlags(Op1, true);
5191
     // Replace every collected output shuffle with a shuffle of the new binops
     // using its reconstruction mask.
5192 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
5193 Builder.SetInsertPoint(Shuffles[S]);
5194 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
5195 replaceValue(*Shuffles[S], *NSV, false);
5196 }
5197
     // Queue the new input shuffles on the worklist.
5198 Worklist.pushValue(NSV0A);
5199 Worklist.pushValue(NSV0B);
5200 Worklist.pushValue(NSV1A);
5201 Worklist.pushValue(NSV1B);
5202 return true;
5203}
5204
5205/// Check if instruction depends on ZExt and this ZExt can be moved after the
5206/// instruction. Move ZExt if it is profitable. For example:
5207/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
5208/// lshr(zext(x),y) -> zext(lshr(x,trunc(y)))
5209/// Cost model calculations takes into account if zext(x) has other users and
5210/// whether it can be propagated through them too.
///
/// Returns true if \p I was replaced by a zext of the narrowed operation.
5211bool VectorCombine::shrinkType(Instruction &I) {
5212 Value *ZExted, *OtherOperand;
     // Bitwise logic is matched commutatively; for lshr the zext must be the
     // shifted value (operand 0).
5213 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
5214 m_Value(OtherOperand))) &&
5215 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
5216 return false;
5217
     // The operand of I that is not OtherOperand, i.e. the zext itself.
5218 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
5219
     // NOTE(review): both types are cast<> to FixedVectorType unconditionally;
     // presumably the caller only invokes this on fixed-width vector
     // instructions -- confirm.
5220 auto *BigTy = cast<FixedVectorType>(I.getType());
5221 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
5222 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
5223
5224 if (I.getOpcode() == Instruction::LShr) {
5225 // Check that the shift amount is less than the number of bits in the
5226 // smaller type. Otherwise, the smaller lshr will return a poison value.
5227 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
5228 if (ShAmtKB.getMaxValue().uge(BW))
5229 return false;
5230 } else {
5231 // Check that the expression overall uses at most the same number of bits as
5232 // ZExted
5233 KnownBits KB = computeKnownBits(&I, *DL);
5234 if (KB.countMaxActiveBits() > BW)
5235 return false;
5236 }
5237
5238 // Calculate costs of leaving current IR as it is and moving ZExt operation
5239 // later, along with adding truncates if needed
     // NOTE(review): the embedded listing numbers jump 5239 -> 5241, so the head
     // of the statement that declares ZExtCost is missing from this copy; from
     // the remaining arguments it is presumably the cast cost of a ZExt from
     // SmallTy to BigTy -- restore from the upstream file and confirm.
5241 Instruction::ZExt, BigTy, SmallTy,
5242 TargetTransformInfo::CastContextHint::None, CostKind);
     // CurrentCost: one zext plus every user at the wide type.
     // ShrinkCost: every user at the narrow type, each followed by its own zext.
5243 InstructionCost CurrentCost = ZExtCost;
5244 InstructionCost ShrinkCost = 0;
5245
5246 // Calculate total cost and check that we can propagate through all ZExt users
5247 for (User *U : ZExtOperand->users()) {
5248 auto *UI = cast<Instruction>(U);
     // I itself was already validated above, so only its costs are added here.
5249 if (UI == &I) {
5250 CurrentCost +=
5251 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
5252 ShrinkCost +=
5253 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
5254 ShrinkCost += ZExtCost;
5255 continue;
5256 }
5257
     // Any non-binary-operator user of the zext blocks the transform.
5258 if (!Instruction::isBinaryOp(UI->getOpcode()))
5259 return false;
5260
5261 // Check if we can propagate ZExt through its other users
5262 KnownBits KB = computeKnownBits(UI, *DL);
5263 if (KB.countMaxActiveBits() > BW)
5264 return false;
5265
5266 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
5267 ShrinkCost +=
5268 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
5269 ShrinkCost += ZExtCost;
5270 }
5271
5272 // If the other instruction operand is not a constant, we'll need to
5273 // generate a truncate instruction. So we have to adjust cost
5274 if (!isa<Constant>(OtherOperand))
5275 ShrinkCost += TTI.getCastInstrCost(
5276 Instruction::Trunc, SmallTy, BigTy,
5277 TargetTransformInfo::CastContextHint::None, CostKind);
5278
5279 // If the cost of shrinking types and leaving the IR is the same, we'll lean
5280 // towards modifying the IR because shrinking opens opportunities for other
5281 // shrinking optimisations.
5282 if (ShrinkCost > CurrentCost)
5283 return false;
5284
     // Only I is rewritten here; the other users of the zext are left untouched
     // by this call.
5285 Builder.SetInsertPoint(&I);
5286 Value *Op0 = ZExted;
5287 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
5288 // Keep the order of operands the same
5289 if (I.getOperand(0) == OtherOperand)
5290 std::swap(Op0, Op1);
5291 Value *NewBinOp =
5292 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
     // NOTE(review): the cast<Instruction> calls assume the builder did not fold
     // the new binop to a constant -- confirm this cannot happen for the
     // patterns matched above.
5293 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
5294 cast<Instruction>(NewBinOp)->copyMetadata(I);
5295 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
5296 replaceValue(I, *NewZExtr);
5297 return true;
5298}
5299
5300/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
5301/// shuffle (DstVec, SrcVec, Mask)
///
/// Both indices must be in-range constants and the two vectors must be fixed
/// vectors sharing an element type; single-element destinations are skipped.
/// The rewrite is performed only when the cost model does not rate the
/// shuffle form as more expensive than the insert/extract pair (ties favour
/// the shuffle). Returns true if \p I was replaced.
5302bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
5303 Value *DstVec, *SrcVec;
5304 uint64_t ExtIdx, InsIdx;
5305 if (!match(&I,
5306 m_InsertElt(m_Value(DstVec),
5307 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
5308 m_ConstantInt(InsIdx))))
5309 return false;
5310
5311 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
5312 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
5313 // Source and destination may differ in element count (handled by the
// length-changing shuffle below), but their element types must match.
5314 if (!DstVecTy || !SrcVecTy ||
5315 SrcVecTy->getElementType() != DstVecTy->getElementType())
5316 return false;
5317
5318 unsigned NumDstElts = DstVecTy->getNumElements();
5319 unsigned NumSrcElts = SrcVecTy->getNumElements();
5320 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
5321 return false;
5322
5323 // Insertion into poison is a cheaper single operand shuffle.
// NOTE(review): listing line 5324 is absent from this extract; presumably it
// declares the shuffle kind `SK` used by the cost queries below - confirm
// against the full file.
5325 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
5326
5327 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
5328 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
5329 if (NeedDstSrcSwap) {
// Single-source form: every lane is poison except InsIdx, which reads the
// extracted element from what becomes the first operand after the swap.
// NOTE(review): listing line 5330 is absent here (presumably sets `SK`).
5331 Mask[InsIdx] = ExtIdx % NumDstElts;
5332 std::swap(DstVec, SrcVec);
5333 } else {
// Two-source form: identity over DstVec, with lane InsIdx redirected to the
// second operand (indices >= NumDstElts).
// NOTE(review): listing line 5334 is absent here (presumably sets `SK`).
5335 std::iota(Mask.begin(), Mask.end(), 0);
5336 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
5337 }
5338
5339 // Cost
5340 auto *Ins = cast<InsertElementInst>(&I);
5341 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
5342 InstructionCost InsCost =
5343 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
// NOTE(review): the extract is costed against DstVecTy although it reads from
// SrcVec (type SrcVecTy); the two differ when NeedExpOrNarrow is set - confirm
// this is intended.
5344 InstructionCost ExtCost =
5345 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
5346 InstructionCost OldCost = ExtCost + InsCost;
5347
5348 InstructionCost NewCost = 0;
5349 SmallVector<int> ExtToVecMask;
5350 if (!NeedExpOrNarrow) {
5351 // Ignore 'free' identity insertion shuffle.
5352 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
5353 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
5354 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
5355 nullptr, {DstVec, SrcVec});
5356 } else {
5357 // When creating a length-changing-vector, always try to keep the relevant
5358 // element in an equivalent position, so that bulk shuffles are more likely
5359 // to be useful.
5360 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
5361 ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
5362 // Add cost for expanding or narrowing
// NOTE(review): listing line 5363 is absent from this extract; the next line
// is the tail of a call whose head (presumably a `NewCost +=` shuffle-cost
// query for ExtToVecMask) is not visible.
5364 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
5365 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
5366 }
5367
// The extract stays alive for its other users, so it is still paid for.
5368 if (!Ext->hasOneUse())
5369 NewCost += ExtCost;
5370
5371 LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair: " << I
5372 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
5373 << "\n");
5374
5375 if (OldCost < NewCost)
5376 return false;
5377
// Resize the source vector to the destination length first. After a
// Dst/Src swap the original source lives in DstVec, hence the two branches.
5378 if (NeedExpOrNarrow) {
5379 if (!NeedDstSrcSwap)
5380 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
5381 else
5382 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
5383 }
5384
5385 // Canonicalize undef param to RHS to help further folds.
5386 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
5387 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
5388 std::swap(DstVec, SrcVec);
5389 }
5390
5391 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
5392 replaceValue(I, *Shuf);
5393
5394 return true;
5395}
5396
5397/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
5398/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
5399/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
5400/// before casting it back into `<vscale x 16 x i32>`.
///
/// Returns true if \p I was replaced by a bitcast of the widened splat.
5401bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
5402 const APInt *SplatVal0, *SplatVal1;
// NOTE(review): listing line 5403 is absent from this extract; the next line
// is the tail of the pattern match that binds the two splat constants.
5404 m_APInt(SplatVal0), m_APInt(SplatVal1))))
5405 return false;
5406
5407 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
5408 << "\n");
5409
// VTy is the type of the first intrinsic argument; ExtVTy keeps the element
// count and widens each element so one wide element can hold a pair.
5410 auto *VTy =
5411 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
5412 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
5413 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
5414
5415 // Just in case the cost of interleave2 intrinsic and bitcast are both
5416 // invalid, in which case we want to bail out, we use <= rather
5417 // than < here. Even they both have valid and equal costs, it's probably
5418 // not a good idea to emit a high-cost constant splat.
// NOTE(review): listing lines 5419 and 5421 are absent from this extract, so
// the head of the cost comparison that opens this scope is not visible.
5420 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
5422 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
5423 << *I.getType() << " is too high.\n");
5424 return false;
5425 }
5426
// Pack the pair into one double-width value: the second splat occupies the
// high half, the first splat the low half.
5427 APInt NewSplatVal = SplatVal1->zext(Width * 2);
5428 NewSplatVal <<= Width;
5429 NewSplatVal |= SplatVal0->zext(Width * 2);
5430 auto *NewSplat = ConstantVector::getSplat(
5431 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
5432
// A function-local builder positioned at I (it hides any outer `Builder`).
5433 IRBuilder<> Builder(&I);
5434 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
5435 return true;
5436}
5437
5438// Attempt to shrink loads that are only used by shufflevector instructions.
//
// Only simple loads of fixed vector type are considered. The load is replaced
// by a load of the leading `max used index + 1` elements when the cost model
// does not rate the new load plus shuffles as more expensive. Returns true if
// the IR was changed.
5439bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
5440 auto *OldLoad = dyn_cast<LoadInst>(&I);
5441 if (!OldLoad || !OldLoad->isSimple())
5442 return false;
5443
5444 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
5445 if (!OldLoadTy)
5446 return false;
5447
5448 unsigned const OldNumElements = OldLoadTy->getNumElements();
5449
5450 // Search all uses of load. If all uses are shufflevector instructions whose
5451 // first operand is the load and whose second operand is matched by
5452 // m_Undef(), find the minimum and maximum indices of the load elements
5453 // referenced by all shuffle masks. Otherwise return `std::nullopt`.
5454 using IndexRange = std::pair<int, int>;
5455 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
// Start with an empty (inverted) range so the first index seen sets both ends.
5456 IndexRange OutputRange = IndexRange(OldNumElements, -1);
5457 for (llvm::Use &Use : I.uses()) {
5458 // Ensure all uses match the required pattern.
5459 User *Shuffle = Use.getUser();
5460 ArrayRef<int> Mask;
5461
5462 if (!match(Shuffle,
5463 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
5464 return std::nullopt;
5465
5466 // Ignore shufflevector instructions that have no uses.
5467 if (Shuffle->use_empty())
5468 continue;
5469
5470 // Find the min and max indices used by the shufflevector instruction.
// Negative (poison) indices and indices into the second operand are ignored.
5471 for (int Index : Mask) {
5472 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
5473 OutputRange.first = std::min(Index, OutputRange.first);
5474 OutputRange.second = std::max(Index, OutputRange.second);
5475 }
5476 }
5477 }
5478
// Still inverted: no live shuffle referenced any element of the load.
5479 if (OutputRange.second < OutputRange.first)
5480 return std::nullopt;
5481
5482 return OutputRange;
5483 };
5484
5485 // Get the range of vector elements used by shufflevector instructions.
5486 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
// Only the upper bound is used: the new load keeps the original pointer, so
// it always covers elements [0, max index].
5487 unsigned const NewNumElements = Indices->second + 1u;
5488
5489 // If the range of vector elements is smaller than the full load, attempt
5490 // to create a smaller load.
5491 if (NewNumElements < OldNumElements) {
5492 IRBuilder Builder(&I);
5493 Builder.SetCurrentDebugLocation(I.getDebugLoc());
5494
5495 // Calculate costs of old and new ops.
5496 Type *ElemTy = OldLoadTy->getElementType();
5497 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
5498 Value *PtrOp = OldLoad->getPointerOperand();
5499
// NOTE(review): listing line 5500 is absent from this extract; the next two
// lines are the tail of the query that presumably initialises `OldCost`.
5501 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
5502 OldLoad->getPointerAddressSpace(), CostKind);
5503 InstructionCost NewCost =
5504 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
5505 OldLoad->getPointerAddressSpace(), CostKind);
5506
5507 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
// NOTE(review): listing line 5508 is absent from this extract; presumably it
// declares the `NewUses` container filled below.
// Any mask index at or above twice the new element count cannot be expressed
// against the two-operand shuffle of the narrowed type.
5509 unsigned const MaxIndex = NewNumElements * 2u;
5510
5511 for (llvm::Use &Use : I.uses()) {
5512 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
5513
5514 // Ignore shufflevector instructions that have no uses.
5515 if (Shuffle->use_empty())
5516 continue;
5517
5518 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
5519
5520 // Create entry for new use.
5521 NewUses.push_back({Shuffle, OldMask});
5522
5523 // Validate mask indices.
// Bailing out here is safe: no instruction has been created or replaced yet.
5524 for (int Index : OldMask) {
5525 if (Index >= static_cast<int>(MaxIndex))
5526 return false;
5527 }
5528
5529 // Update costs.
// NOTE(review): listing lines 5531 and 5534 are absent from this extract; they
// hold the heads of the two cost queries whose tails follow.
5530 OldCost +=
5532 OldLoadTy, OldMask, CostKind);
5533 NewCost +=
5535 NewLoadTy, OldMask, CostKind);
5536 }
5537
5538 LLVM_DEBUG(
5539 dbgs() << "Found a load used only by shufflevector instructions: "
5540 << I << "\n OldCost: " << OldCost
5541 << " vs NewCost: " << NewCost << "\n");
5542
5543 if (OldCost < NewCost || !NewCost.isValid())
5544 return false;
5545
5546 // Create new load of smaller vector.
5547 auto *NewLoad = cast<LoadInst>(
5548 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
5549 NewLoad->copyMetadata(I);
5550
5551 // Replace all uses.
// Each shuffle is rebuilt on the narrowed load with its original mask.
5552 for (UseEntry &Use : NewUses) {
5553 ShuffleVectorInst *Shuffle = Use.first;
5554 std::vector<int> &NewMask = Use.second;
5555
5556 Builder.SetInsertPoint(Shuffle);
5557 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
5558 Value *NewShuffle = Builder.CreateShuffleVector(
5559 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
5560
5561 replaceValue(*Shuffle, *NewShuffle, false);
5562 }
5563
5564 return true;
5565 }
5566 }
5567 return false;
5568}
5569
5570// Attempt to narrow a phi of shufflevector instructions where the two incoming
5571// values have the same operands but different masks. If the two shuffle masks
5572// are offsets of one another we can use one branch to rotate the incoming
5573// vector and perform one larger shuffle after the phi.
5574bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
5575 auto *Phi = dyn_cast<PHINode>(&I);
5576 if (!Phi || Phi->getNumIncomingValues() != 2u)
5577 return false;
5578
5579 Value *Op = nullptr;
5580 ArrayRef<int> Mask0;
5581 ArrayRef<int> Mask1;
5582
5583 if (!match(Phi->getOperand(0u),
5584 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
5585 !match(Phi->getOperand(1u),
5586 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
5587 return false;
5588
5589 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
5590
5591 // Ensure result vectors are wider than the argument vector.
5592 auto *InputVT = cast<FixedVectorType>(Op->getType());
5593 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
5594 auto const InputNumElements = InputVT->getNumElements();
5595
5596 if (InputNumElements >= ResultVT->getNumElements())
5597 return false;
5598
5599 // Take the difference of the two shuffle masks at each index. Ignore poison
5600 // values at the same index in both masks.
5601 SmallVector<int, 16> NewMask;
5602 NewMask.reserve(Mask0.size());
5603
5604 for (auto [M0, M1] : zip(Mask0, Mask1)) {
5605 if (M0 >= 0 && M1 >= 0)
5606 NewMask.push_back(M0 - M1);
5607 else if (M0 == -1 && M1 == -1)
5608 continue;
5609 else
5610 return false;
5611 }
5612
5613 // Ensure all elements of the new mask are equal. If the difference between
5614 // the incoming mask elements is the same, the two must be constant offsets
5615 // of one another.
5616 if (NewMask.empty() || !all_equal(NewMask))
5617 return false;
5618
5619 // Create new mask using difference of the two incoming masks.
5620 int MaskOffset = NewMask[0u];
5621 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
5622 NewMask.clear();
5623
5624 for (unsigned I = 0u; I < InputNumElements; ++I) {
5625 NewMask.push_back(Index);
5626 Index = (Index + 1u) % InputNumElements;
5627 }
5628
5629 // Calculate costs for worst cases and compare.
5630 auto const Kind = TTI::SK_PermuteSingleSrc;
5631 auto OldCost =
5632 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
5633 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
5634 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
5635 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
5636
5637 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
5638 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
5639 << "\n");
5640
5641 if (NewCost > OldCost)
5642 return false;
5643
5644 // Create new shuffles and narrowed phi.
5645 auto Builder = IRBuilder(Shuf);
5646 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
5647 auto *PoisonVal = PoisonValue::get(InputVT);
5648 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
5649 Worklist.push(cast<Instruction>(NewShuf0));
5650
5651 Builder.SetInsertPoint(Phi);
5652 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
5653 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
5654 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
5655 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
5656
5657 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
5658 PoisonVal = PoisonValue::get(NewPhi->getType());
5659 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
5660
5661 replaceValue(*Phi, *NewShuf1);
5662 return true;
5663}
5664
5665/// This is the entry point for all transforms. Pass manager differences are
5666/// handled in the callers of this function.
///
/// Visits every non-debug instruction of every reachable block once, then
/// drains the worklist, dispatching each instruction to the folds that can
/// apply to its opcode and type. Returns true if any fold changed the IR.
5667bool VectorCombine::run() {
// NOTE(review): listing line 5668 (the condition guarding this early return)
// is absent from this extract.
5669 return false;
5670
5671 // Don't attempt vectorization if the target does not support vectors.
5672 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
5673 return false;
5674
5675 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
5676
// Tries the applicable folds on one instruction, stopping at the first that
// succeeds. Returns true if the instruction was folded.
5677 auto FoldInst = [this](Instruction &I) {
5678 Builder.SetInsertPoint(&I);
5679 bool IsVectorType = isa<VectorType>(I.getType());
5680 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
5681 auto Opcode = I.getOpcode();
5682
5683 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
5684
5685 // These folds should be beneficial regardless of when this pass is run
5686 // in the optimization pipeline.
5687 // The type checking is for run-time efficiency. We can avoid wasting time
5688 // dispatching to folding functions if there's no chance of matching.
5689 if (IsFixedVectorType) {
5690 switch (Opcode) {
5691 case Instruction::InsertElement:
5692 if (vectorizeLoadInsert(I))
5693 return true;
5694 break;
5695 case Instruction::ShuffleVector:
5696 if (widenSubvectorLoad(I))
5697 return true;
5698 break;
5699 default:
5700 break;
5701 }
5702 }
5703
5704 // This transform works with scalable and fixed vectors
5705 // TODO: Identify and allow other scalable transforms
5706 if (IsVectorType) {
5707 if (scalarizeOpOrCmp(I))
5708 return true;
5709 if (scalarizeLoad(I))
5710 return true;
5711 if (scalarizeExtExtract(I))
5712 return true;
5713 if (scalarizeVPIntrinsic(I))
5714 return true;
5715 if (foldInterleaveIntrinsics(I))
5716 return true;
5717 }
5718
5719 if (Opcode == Instruction::Store)
5720 if (foldSingleElementStore(I))
5721 return true;
5722
5723 // If this is an early pipeline invocation of this pass, we are done.
5724 if (TryEarlyFoldsOnly)
5725 return false;
5726
5727 // Otherwise, try folds that improve codegen but may interfere with
5728 // early IR canonicalizations.
5729 // The type checking is for run-time efficiency. We can avoid wasting time
5730 // dispatching to folding functions if there's no chance of matching.
5731 if (IsFixedVectorType) {
5732 switch (Opcode) {
5733 case Instruction::InsertElement:
5734 if (foldInsExtFNeg(I))
5735 return true;
5736 if (foldInsExtBinop(I))
5737 return true;
5738 if (foldInsExtVectorToShuffle(I))
5739 return true;
5740 break;
5741 case Instruction::ShuffleVector:
5742 if (foldPermuteOfBinops(I))
5743 return true;
5744 if (foldShuffleOfBinops(I))
5745 return true;
5746 if (foldShuffleOfSelects(I))
5747 return true;
5748 if (foldShuffleOfCastops(I))
5749 return true;
5750 if (foldShuffleOfShuffles(I))
5751 return true;
5752 if (foldPermuteOfIntrinsic(I))
5753 return true;
5754 if (foldShufflesOfLengthChangingShuffles(I))
5755 return true;
5756 if (foldShuffleOfIntrinsics(I))
5757 return true;
5758 if (foldSelectShuffle(I))
5759 return true;
5760 if (foldShuffleToIdentity(I))
5761 return true;
5762 break;
5763 case Instruction::Load:
5764 if (shrinkLoadForShuffles(I))
5765 return true;
5766 break;
5767 case Instruction::BitCast:
5768 if (foldBitcastShuffle(I))
5769 return true;
5770 if (foldSelectsFromBitcast(I))
5771 return true;
5772 break;
5773 case Instruction::And:
5774 case Instruction::Or:
5775 case Instruction::Xor:
5776 if (foldBitOpOfCastops(I))
5777 return true;
5778 if (foldBitOpOfCastConstant(I))
5779 return true;
5780 break;
5781 case Instruction::PHI:
5782 if (shrinkPhiOfShuffles(I))
5783 return true;
5784 break;
5785 default:
5786 if (shrinkType(I))
5787 return true;
5788 break;
5789 }
5790 } else {
// Instructions whose result is not a fixed vector (e.g. scalar results of
// reductions, extracts and compares).
5791 switch (Opcode) {
5792 case Instruction::Call:
5793 if (foldShuffleFromReductions(I))
5794 return true;
5795 if (foldCastFromReductions(I))
5796 return true;
5797 break;
5798 case Instruction::ExtractElement:
5799 if (foldShuffleChainsToReduce(I))
5800 return true;
5801 break;
5802 case Instruction::ICmp:
5803 if (foldSignBitReductionCmp(I))
5804 return true;
5805 if (foldICmpEqZeroVectorReduce(I))
5806 return true;
5807 if (foldEquivalentReductionCmp(I))
5808 return true;
// ICmp additionally shares the FCmp handling below.
5809 [[fallthrough]];
5810 case Instruction::FCmp:
5811 if (foldExtractExtract(I))
5812 return true;
5813 break;
5814 case Instruction::Or:
5815 if (foldConcatOfBoolMasks(I))
5816 return true;
// Or is a binary operator, so it also gets the generic binop folds.
5817 [[fallthrough]];
5818 default:
5819 if (Instruction::isBinaryOp(Opcode)) {
5820 if (foldExtractExtract(I))
5821 return true;
5822 if (foldExtractedCmps(I))
5823 return true;
5824 if (foldBinopOfReductions(I))
5825 return true;
5826 }
5827 break;
5828 }
5829 }
5830 return false;
5831 };
5832
// Phase 1: one pass over every instruction in each reachable block.
5833 bool MadeChange = false;
5834 for (BasicBlock &BB : F) {
5835 // Ignore unreachable basic blocks.
5836 if (!DT.isReachableFromEntry(&BB))
5837 continue;
5838 // Iterate with a manually tracked successor so that instructions can be
// erased inside the loop.
5839 // make_early_inc_range is not applicable here, as the next iterator may
5840 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
5841 // We manually maintain the next instruction and update it when it is about
5842 // to be deleted.
5843 Instruction *I = &BB.front();
5844 while (I) {
5845 NextInst = I->getNextNode();
5846 if (!I->isDebugOrPseudoInst())
5847 MadeChange |= FoldInst(*I);
5848 I = NextInst;
5849 }
5850 }
5851
// The block walk is over, so no successor needs tracking any more.
5852 NextInst = nullptr;
5853
// Phase 2: revisit instructions queued on the worklist until it is empty.
5854 while (!Worklist.isEmpty()) {
5855 Instruction *I = Worklist.removeOne();
5856 if (!I)
5857 continue;
5858
// NOTE(review): listing lines 5859-5860 are absent from this extract; they
// hold the condition (and presumably the action) of the block closed below.
5861 continue;
5862 }
5863
5864 MadeChange |= FoldInst(*I);
5865 }
5866
5867 return MadeChange;
5868}
5869
5872 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
5874 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
5875 AAResults &AA = FAM.getResult<AAManager>(F);
5876 const DataLayout *DL = &F.getDataLayout();
5877 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
5878 TryEarlyFoldsOnly);
5879 if (!Combiner.run())
5880 return PreservedAnalyses::all();
5883 return PA;
5884}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< unsigned > MaxInstrsToScan("aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden, cl::desc("Max number of instructions to scan for aggressive instcombine."))
This is the interface for LLVM's primary stateless and local alias analysis.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
This file defines the DenseMap class.
#define Check(C,...)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition LICM.cpp:1449
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T1
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
FunctionAnalysisManager FAM
const SmallVectorImpl< MachineOperand > & Cond
unsigned OpIndex
This file contains some templates that are useful if you are working with the STL at all.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
static Value * generateNewInstTree(ArrayRef< InstLane > Item, FixedVectorType *Ty, const SmallPtrSet< Use *, 4 > &IdentityLeafs, const SmallPtrSet< Use *, 4 > &SplatLeafs, const SmallPtrSet< Use *, 4 > &ConcatLeafs, IRBuilderBase &Builder, const TargetTransformInfo *TTI)
static bool isFreeConcat(ArrayRef< InstLane > Item, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI)
Detect concat of multiple values into a vector.
static void analyzeCostOfVecReduction(const IntrinsicInst &II, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI, InstructionCost &CostBeforeReduction, InstructionCost &CostAfterReduction)
static SmallVector< InstLane > generateInstLaneVectorFromOperand(ArrayRef< InstLane > Item, int Op)
static Value * createShiftShuffle(Value *Vec, unsigned OldIndex, unsigned NewIndex, IRBuilderBase &Builder)
Create a shuffle that translates (shifts) 1 element from the input vector to a new element location.
static Align computeAlignmentAfterScalarization(Align VectorAlignment, Type *ScalarType, Value *Idx, const DataLayout &DL)
The memory operation on a vector of ScalarType had alignment of VectorAlignment.
static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI)
Returns true if this ShuffleVectorInst eventually feeds into a vector reduction intrinsic (e....
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, Instruction *CtxI, AssumptionCache &AC, const DominatorTree &DT)
Check if it is legal to scalarize a memory access to VecTy at index Idx.
static cl::opt< bool > DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden, cl::desc("Disable all vector combine transforms"))
static InstLane lookThroughShuffles(Use *U, int Lane)
static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI)
static const unsigned InvalidIndex
std::pair< Use *, int > InstLane
static Value * translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, IRBuilderBase &Builder)
Given an extract element instruction with constant index operand, shuffle the source vector (shift th...
static cl::opt< unsigned > MaxInstrsToScan("vector-combine-max-scan-instrs", cl::init(30), cl::Hidden, cl::desc("Max number of instructions to scan for vector combining."))
static cl::opt< bool > DisableBinopExtractShuffle("disable-binop-extract-shuffle", cl::init(false), cl::Hidden, cl::desc("Disable binop extract to shuffle transforms"))
static bool isMemModifiedBetween(BasicBlock::iterator Begin, BasicBlock::iterator End, const MemoryLocation &Loc, AAResults &AA)
static constexpr int Concat[]
Value * RHS
Value * LHS
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1023
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1630
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Return true if the attribute exists in this set.
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
BinaryOps getOpcode() const
Definition InstrTypes.h:374
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition InstrTypes.h:982
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
bool isFPPredicate() const
Definition InstrTypes.h:782
static LLVM_ABI std::optional< CmpPredicate > getMatching(CmpPredicate A, CmpPredicate B)
Compares two CmpPredicates taking samesign into account and returns the canonicalized CmpPredicate if...
Combiner implementation.
Definition Combiner.h:34
static LLVM_ABI Constant * getExtractElement(Constant *Vec, Constant *Idx, Type *OnlyIfReducedTy=nullptr)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
LLVM_ABI ConstantRange urem(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an unsigned remainder operation of...
LLVM_ABI ConstantRange binaryAnd(const ConstantRange &Other) const
Return a new range representing the possible values resulting from a binary-and of a value in this ra...
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
This instruction extracts a single (scalar) element from a VectorType value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
Predicate getSignedPredicate() const
For example, EQ->EQ, SLE->SLE, UGT->SGT, etc.
bool isEquality() const
Return true if this predicate is either EQ or NE.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2561
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2549
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1871
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2627
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1516
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition IRBuilder.h:2210
Value * CreateIsNotNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg > -1.
Definition IRBuilder.h:2651
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:1952
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2235
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:527
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:522
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2442
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2473
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition IRBuilder.h:172
Value * CreateIsNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg < 0.
Definition IRBuilder.h:2646
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2176
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1854
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1495
LLVM_ABI Value * CreateNAryOp(unsigned Opc, ArrayRef< Value * > Ops, const Twine &Name="", MDNode *FPMathTag=nullptr)
Create either a UnaryOperator or BinaryOperator depending on Opc.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2054
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2583
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1554
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1867
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2040
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:604
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1711
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateFNegFMF(Value *V, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1802
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2418
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1576
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void push(Instruction *I)
Push the instruction onto the worklist stack.
LLVM_ABI void setHasNoUnsignedWrap(bool b=true)
Set or clear the nuw flag on this instruction, which must be an operator which supports this flag.
LLVM_ABI void copyIRFlags(const Value *V, bool IncludeWrapFlags=true)
Convenience method to copy supported exact, fast-math, and (optionally) wrapping flags from V to this...
LLVM_ABI void setHasNoSignedWrap(bool b=true)
Set or clear the nsw flag on this instruction, which must be an operator which supports this flag.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void andIRFlags(const Value *V)
Logical 'and' of any supported wrapping, exact, and fast-math flags of V and this instruction.
bool isBinaryOp() const
LLVM_ABI void setNonNeg(bool b=true)
Set or clear the nneg flag on this instruction, which must be a zext instruction.
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
LLVM_ABI AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAlignment(Align Align)
Type * getPointerOperandType() const
Align getAlign() const
Return the alignment of the access that is being performed.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
const SDValue & getOperand(unsigned Num) const
This instruction constructs a fixed permutation of two input vectors.
int getMaskValue(unsigned Elt) const
Return the shuffle mask value of this instruction for the given element index.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static void commuteShuffleMask(MutableArrayRef< int > Mask, unsigned InVecNumElts)
Change values in a shuffle permute mask assuming the two vector operands of length InVecNumElts have ...
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
void setAlignment(Align Align)
Analysis pass providing the TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ None
The insert/extract is not used with a load/store.
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
static LLVM_ABI OperandValueInfo commonOperandInfo(const Value *X, const Value *Y)
Collect common data between two OperandValueInfo inputs.
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const
Returns true if GEP may be used to index into vectors for this target.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, const Value *Op0=nullptr, const Value *Op1=nullptr, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
LLVM_ABI unsigned getMinVectorRegisterBitWidth() const
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
Estimate the overhead of scalarizing an instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:267
Value * getOperand(unsigned i) const
Definition User.h:207
static LLVM_ABI bool isVPBinOp(Intrinsic::ID ID)
std::optional< unsigned > getFunctionalIntrinsicID() const
std::optional< unsigned > getFunctionalOpcode() const
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition Value.h:760
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:166
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:962
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
bool use_empty() const
Definition Value.h:346
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
bool user_empty() const
Definition Value.h:389
PreservedAnalyses run(Function &F, FunctionAnalysisManager &)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
size_type size() const
Definition DenseSet.h:87
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition APInt.h:2263
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition APInt.h:2268
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
class_match< PoisonValue > m_Poison()
Match an arbitrary poison constant.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::URem > m_URem(const LHS &L, const RHS &R)
class_match< Constant > m_Constant()
Match an arbitrary Constant and ignore it.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
DisjointOr_match< LHS, RHS > m_DisjointOr(const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_right_shift_op > m_Shr(const LHS &L, const RHS &R)
Matches logical shift operations.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
match_combine_and< LTy, RTy > m_CombineAnd(const LTy &L, const RTy &R)
Combine two pattern matchers matching L && R.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
cst_pred_ty< is_non_zero_int > m_NonZeroInt()
Match a non-zero integer or a vector with all non-zero elements.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Shl, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWShl(const LHS &L, const RHS &R)
OverflowingBinaryOp_match< LHS, RHS, Instruction::Mul, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWMul(const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_bitwiselogic_op, true > m_c_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations in either order.
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
match_combine_or< CastInst_match< OpTy, SExtInst >, NNegZExt_match< OpTy > > m_SExtLike(const OpTy &Op)
Match either "sext" or "zext nneg".
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
@ Valid
The data is already valid.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
@ User
could "use" a pointer
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
void stable_sort(R &&Range)
Definition STLExtras.h:2116
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:538
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:350
LLVM_ABI Value * simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q)
Given operand for a UnaryOperator, fold the result or return null.
scope_exit(Callable) -> scope_exit< Callable >
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
LLVM_ABI Value * simplifyCall(CallBase *Call, Value *Callee, ArrayRef< Value * > Args, const SimplifyQuery &Q)
Given a callsite, callee, and arguments, fold the result or return null.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
LLVM_ABI bool mustSuppressSpeculation(const LoadInst &LI)
Return true if speculation of the given load must be suppressed to avoid ordering or interfering with...
Definition Loads.cpp:420
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, bool UseInstrInfo=true, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
unsigned M1(unsigned Val)
Definition VE.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:406
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool isModSet(const ModRefInfo MRI)
Definition ModRef.h:49
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI bool programUndefinedIfPoison(const Instruction *Inst)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:435
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ABI bool isKnownNonZero(const Value *V, const SimplifyQuery &Q, unsigned Depth=0)
Return true if the given value is known to be non-zero when defined.
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
LLVM_ABI bool isSafeToSpeculativelyExecuteWithOpcode(unsigned Opcode, const Instruction *Inst, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
This returns the same result as isSafeToSpeculativelyExecute if Opcode is the actual opcode of Inst.
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
LLVM_ABI Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc)
Returns the reduction intrinsic id corresponding to the binary operation.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
LLVM_ABI Constant * getLosslessInvCast(Constant *C, Type *InvCastTo, unsigned CastOp, const DataLayout &DL, PreservedCastFlags *Flags=nullptr)
Try to cast C to InvC losslessly, satisfying CastOp(InvC) equals C, or CastOp(InvC) is a refined valu...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2166
LLVM_ABI Value * simplifyCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a CmpInst, fold the result or return null.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicID(Intrinsic::ID IID)
Returns the llvm.vector.reduce min/max intrinsic that corresponds to the intrinsic op.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
LLVM_ABI AAMDNodes adjustForAccess(unsigned AccessSize)
Create a new AAMDNode for accessing AccessSize bytes of this AAMDNode.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:312
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:264
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:148
SimplifyQuery getWithInstruction(const Instruction *I) const