1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/Transforms/Vectorize/VectorCombine.h"
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
38#include <numeric>
39#include <optional>
40#include <queue>
41#include <set>
42
43#define DEBUG_TYPE "vector-combine"
44#include "llvm/Transforms/Utils/InstructionWorklist.h"
45
46using namespace llvm;
47using namespace llvm::PatternMatch;
48
49STATISTIC(NumVecLoad, "Number of vector loads formed");
50STATISTIC(NumVecCmp, "Number of vector compares formed");
51STATISTIC(NumVecBO, "Number of vector binops formed");
52STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
53STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
54STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
55STATISTIC(NumScalarCmp, "Number of scalar compares formed");
56STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
57
58static cl::opt<bool> DisableVectorCombine(
59    "disable-vector-combine", cl::init(false), cl::Hidden,
60 cl::desc("Disable all vector combine transforms"));
61
62static cl::opt<bool> DisableBinopExtractShuffle(
63    "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
64 cl::desc("Disable binop extract to shuffle transforms"));
65
66static cl::opt<unsigned> MaxInstrsToScan(
67    "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
68 cl::desc("Max number of instructions to scan for vector combining."));
69
70static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
71
72namespace {
73class VectorCombine {
74public:
75  VectorCombine(Function &F, const TargetTransformInfo &TTI,
76                const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
77                const DataLayout *DL, TTI::TargetCostKind CostKind,
78                bool TryEarlyFoldsOnly)
79 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
80 DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
81 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
82
83 bool run();
84
85private:
86  Function &F;
87  IRBuilder<InstSimplifyFolder> Builder;
88 const TargetTransformInfo &TTI;
89 const DominatorTree &DT;
90 AAResults &AA;
91 AssumptionCache &AC;
92 const DataLayout *DL;
93 TTI::TargetCostKind CostKind;
94 const SimplifyQuery SQ;
95
96 /// If true, only perform beneficial early IR transforms. Do not introduce new
97 /// vector operations.
98 bool TryEarlyFoldsOnly;
99
100 InstructionWorklist Worklist;
101
102  /// Next instruction to visit. Updated when that instruction is erased by
103  /// RecursivelyDeleteTriviallyDeadInstructions.
104 Instruction *NextInst;
105
106 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
107 // parameter. That should be updated to specific sub-classes because the
108 // run loop was changed to dispatch on opcode.
109 bool vectorizeLoadInsert(Instruction &I);
110 bool widenSubvectorLoad(Instruction &I);
111 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
112 ExtractElementInst *Ext1,
113 unsigned PreferredExtractIndex) const;
114 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
115 const Instruction &I,
116 ExtractElementInst *&ConvertToShuffle,
117 unsigned PreferredExtractIndex);
118 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
119 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 bool foldExtractExtract(Instruction &I);
121 bool foldInsExtFNeg(Instruction &I);
122 bool foldInsExtBinop(Instruction &I);
123 bool foldInsExtVectorToShuffle(Instruction &I);
124 bool foldBitOpOfCastops(Instruction &I);
125 bool foldBitOpOfCastConstant(Instruction &I);
126 bool foldBitcastShuffle(Instruction &I);
127 bool scalarizeOpOrCmp(Instruction &I);
128 bool scalarizeVPIntrinsic(Instruction &I);
129 bool foldExtractedCmps(Instruction &I);
130 bool foldBinopOfReductions(Instruction &I);
131 bool foldSingleElementStore(Instruction &I);
132 bool scalarizeLoad(Instruction &I);
133 bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
134 bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
135 bool scalarizeExtExtract(Instruction &I);
136 bool foldConcatOfBoolMasks(Instruction &I);
137 bool foldPermuteOfBinops(Instruction &I);
138 bool foldShuffleOfBinops(Instruction &I);
139 bool foldShuffleOfSelects(Instruction &I);
140 bool foldShuffleOfCastops(Instruction &I);
141 bool foldShuffleOfShuffles(Instruction &I);
142 bool foldPermuteOfIntrinsic(Instruction &I);
143 bool foldShuffleOfIntrinsics(Instruction &I);
144 bool foldShuffleToIdentity(Instruction &I);
145 bool foldShuffleFromReductions(Instruction &I);
146 bool foldShuffleChainsToReduce(Instruction &I);
147 bool foldCastFromReductions(Instruction &I);
148 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
149 bool foldInterleaveIntrinsics(Instruction &I);
150 bool shrinkType(Instruction &I);
151 bool shrinkLoadForShuffles(Instruction &I);
152 bool shrinkPhiOfShuffles(Instruction &I);
153
154 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
155 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
156 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
157 Old.replaceAllUsesWith(&New);
158 if (auto *NewI = dyn_cast<Instruction>(&New)) {
159 New.takeName(&Old);
160 Worklist.pushUsersToWorkList(*NewI);
161 Worklist.pushValue(NewI);
162 }
163 if (Erase && isInstructionTriviallyDead(&Old)) {
164 eraseInstruction(Old);
165 } else {
166 Worklist.push(&Old);
167 }
168 }
169
170 void eraseInstruction(Instruction &I) {
171 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
172 SmallVector<Value *> Ops(I.operands());
173 Worklist.remove(&I);
174 I.eraseFromParent();
175
176 // Push remaining users of the operands and then the operand itself - allows
177 // further folds that were hindered by OneUse limits.
178 SmallPtrSet<Value *, 4> Visited;
179 for (Value *Op : Ops) {
180 if (!Visited.contains(Op)) {
181        if (auto *OpI = dyn_cast<Instruction>(Op)) {
182          if (!RecursivelyDeleteTriviallyDeadInstructions(
183 OpI, nullptr, nullptr, [&](Value *V) {
184 if (auto *I = dyn_cast<Instruction>(V)) {
185 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
186 Worklist.remove(I);
187 if (I == NextInst)
188 NextInst = NextInst->getNextNode();
189 Visited.insert(I);
190 }
191 }))
192 continue;
193 Worklist.pushUsersToWorkList(*OpI);
194 Worklist.pushValue(OpI);
195 }
196 }
197 }
198 }
199};
200} // namespace
201
202/// Return the source operand of a potentially bitcasted value. If there is no
203/// bitcast, return the input value itself.
204static Value *peekThroughBitcasts(Value *V) {
205  while (auto *BitCast = dyn_cast<BitCastInst>(V))
206 V = BitCast->getOperand(0);
207 return V;
208}
209
210static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
211 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
212 // The widened load may load data from dirty regions or create data races
213 // non-existent in the source.
214 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
215      Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
216      mustSuppressSpeculation(*Load))
217 return false;
218
219 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
220 // sure we have all of our type-based constraints in place for this target.
221 Type *ScalarTy = Load->getType()->getScalarType();
222 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
223 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
224 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
225 ScalarSize % 8 != 0)
226 return false;
227
228 return true;
229}
230
231bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
232 // Match insert into fixed vector of scalar value.
233 // TODO: Handle non-zero insert index.
234 Value *Scalar;
235  if (!match(&I,
236             m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
237 return false;
238
239 // Optionally match an extract from another vector.
240 Value *X;
241 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
242 if (!HasExtract)
243 X = Scalar;
244
245 auto *Load = dyn_cast<LoadInst>(X);
246 if (!canWidenLoad(Load, TTI))
247 return false;
248
249 Type *ScalarTy = Scalar->getType();
250 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
251 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
252
253 // Check safety of replacing the scalar load with a larger vector load.
254 // We use minimal alignment (maximum flexibility) because we only care about
255 // the dereferenceable region. When calculating cost and creating a new op,
256 // we may use a larger value based on alignment attributes.
257 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
258 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
259
260 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
261 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
262 unsigned OffsetEltIndex = 0;
263 Align Alignment = Load->getAlign();
264 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
265 &DT)) {
266 // It is not safe to load directly from the pointer, but we can still peek
267    // through gep offsets and check if it is safe to load from a base address with
268 // updated alignment. If it is, we can shuffle the element(s) into place
269 // after loading.
270 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
271    APInt Offset(OffsetBitWidth, 0);
272    SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
273
274 // We want to shuffle the result down from a high element of a vector, so
275 // the offset must be positive.
276 if (Offset.isNegative())
277 return false;
278
279 // The offset must be a multiple of the scalar element to shuffle cleanly
280 // in the element's size.
281 uint64_t ScalarSizeInBytes = ScalarSize / 8;
282 if (Offset.urem(ScalarSizeInBytes) != 0)
283 return false;
284
285 // If we load MinVecNumElts, will our target element still be loaded?
286 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
287 if (OffsetEltIndex >= MinVecNumElts)
288 return false;
289
290 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
291 &DT))
292 return false;
293
294 // Update alignment with offset value. Note that the offset could be negated
295 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
296 // negation does not change the result of the alignment calculation.
297 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
298 }
299
300 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
301 // Use the greater of the alignment on the load or its source pointer.
302 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
303 Type *LoadTy = Load->getType();
304 unsigned AS = Load->getPointerAddressSpace();
305 InstructionCost OldCost =
306 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
307 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
308 OldCost +=
309 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
310 /* Insert */ true, HasExtract, CostKind);
311
312 // New pattern: load VecPtr
313 InstructionCost NewCost =
314 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
315 // Optionally, we are shuffling the loaded vector element(s) into place.
316 // For the mask set everything but element 0 to undef to prevent poison from
317 // propagating from the extra loaded memory. This will also optionally
318 // shrink/grow the vector from the loaded size to the output size.
319 // We assume this operation has no cost in codegen if there was no offset.
320 // Note that we could use freeze to avoid poison problems, but then we might
321 // still need a shuffle to change the vector size.
322 auto *Ty = cast<FixedVectorType>(I.getType());
323 unsigned OutputNumElts = Ty->getNumElements();
324 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
325 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
326 Mask[0] = OffsetEltIndex;
327 if (OffsetEltIndex)
328 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
329 CostKind);
330
331 // We can aggressively convert to the vector form because the backend can
332 // invert this transform if it does not result in a performance win.
333 if (OldCost < NewCost || !NewCost.isValid())
334 return false;
335
336 // It is safe and potentially profitable to load a vector directly:
337 // inselt undef, load Scalar, 0 --> load VecPtr
338 IRBuilder<> Builder(Load);
339 Value *CastedPtr =
340 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
341 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
342 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
343
344 replaceValue(I, *VecLd);
345 ++NumVecLoad;
346 return true;
347}
348
349/// If we are loading a vector and then inserting it into a larger vector with
350/// undefined elements, try to load the larger vector and eliminate the insert.
351/// This removes a shuffle in IR and may allow combining of other loaded values.
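// Illustrative IR example (not from the original source; names, types, and
// alignment are assumed, and the rewrite is subject to the safety and cost
// checks below):
//   %v = load <2 x float>, ptr %p, align 16
//   %w = shufflevector <2 x float> %v, <2 x float> poison,
//        <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
// may become:
//   %w = load <4 x float>, ptr %p, align 16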
352bool VectorCombine::widenSubvectorLoad(Instruction &I) {
353 // Match subvector insert of fixed vector.
354 auto *Shuf = cast<ShuffleVectorInst>(&I);
355 if (!Shuf->isIdentityWithPadding())
356 return false;
357
358 // Allow a non-canonical shuffle mask that is choosing elements from op1.
359 unsigned NumOpElts =
360 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
361 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
362 return M >= (int)(NumOpElts);
363 });
364
365 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
366 if (!canWidenLoad(Load, TTI))
367 return false;
368
369 // We use minimal alignment (maximum flexibility) because we only care about
370 // the dereferenceable region. When calculating cost and creating a new op,
371 // we may use a larger value based on alignment attributes.
372 auto *Ty = cast<FixedVectorType>(I.getType());
373 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
374 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
375 Align Alignment = Load->getAlign();
376 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
377 return false;
378
379 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
380 Type *LoadTy = Load->getType();
381 unsigned AS = Load->getPointerAddressSpace();
382
383 // Original pattern: insert_subvector (load PtrOp)
384 // This conservatively assumes that the cost of a subvector insert into an
385 // undef value is 0. We could add that cost if the cost model accurately
386 // reflects the real cost of that operation.
387 InstructionCost OldCost =
388 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
389
390 // New pattern: load PtrOp
391 InstructionCost NewCost =
392 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
393
394 // We can aggressively convert to the vector form because the backend can
395 // invert this transform if it does not result in a performance win.
396 if (OldCost < NewCost || !NewCost.isValid())
397 return false;
398
399 IRBuilder<> Builder(Load);
400 Value *CastedPtr =
401 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
402 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
403 replaceValue(I, *VecLd);
404 ++NumVecLoad;
405 return true;
406}
407
408/// Determine which, if any, of the inputs should be replaced by a shuffle
409/// followed by extract from a different index.
410ExtractElementInst *VectorCombine::getShuffleExtract(
411 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
412 unsigned PreferredExtractIndex = InvalidIndex) const {
413 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
414 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
415 assert(Index0C && Index1C && "Expected constant extract indexes");
416
417 unsigned Index0 = Index0C->getZExtValue();
418 unsigned Index1 = Index1C->getZExtValue();
419
420 // If the extract indexes are identical, no shuffle is needed.
421 if (Index0 == Index1)
422 return nullptr;
423
424 Type *VecTy = Ext0->getVectorOperand()->getType();
425 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
426 InstructionCost Cost0 =
427 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
428 InstructionCost Cost1 =
429 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
430
431  // If both costs are invalid, no shuffle is needed.
432 if (!Cost0.isValid() && !Cost1.isValid())
433 return nullptr;
434
435 // We are extracting from 2 different indexes, so one operand must be shuffled
436 // before performing a vector operation and/or extract. The more expensive
437 // extract will be replaced by a shuffle.
438 if (Cost0 > Cost1)
439 return Ext0;
440 if (Cost1 > Cost0)
441 return Ext1;
442
443 // If the costs are equal and there is a preferred extract index, shuffle the
444 // opposite operand.
445 if (PreferredExtractIndex == Index0)
446 return Ext1;
447 if (PreferredExtractIndex == Index1)
448 return Ext0;
449
450 // Otherwise, replace the extract with the higher index.
451 return Index0 > Index1 ? Ext0 : Ext1;
452}
453
454/// Compare the relative costs of 2 extracts followed by scalar operation vs.
455/// vector operation(s) followed by extract. Return true if the existing
456/// instructions are cheaper than a vector alternative. Otherwise, return false
457/// and if one of the extracts should be transformed to a shufflevector, set
458/// \p ConvertToShuffle to that extract instruction.
459bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
460 ExtractElementInst *Ext1,
461 const Instruction &I,
462 ExtractElementInst *&ConvertToShuffle,
463 unsigned PreferredExtractIndex) {
464 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
465 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
466 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
467
468 unsigned Opcode = I.getOpcode();
469 Value *Ext0Src = Ext0->getVectorOperand();
470 Value *Ext1Src = Ext1->getVectorOperand();
471 Type *ScalarTy = Ext0->getType();
472 auto *VecTy = cast<VectorType>(Ext0Src->getType());
473 InstructionCost ScalarOpCost, VectorOpCost;
474
475 // Get cost estimates for scalar and vector versions of the operation.
476 bool IsBinOp = Instruction::isBinaryOp(Opcode);
477 if (IsBinOp) {
478 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
479 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
480 } else {
481 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
482 "Expected a compare");
483 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
484 ScalarOpCost = TTI.getCmpSelInstrCost(
485 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
486 VectorOpCost = TTI.getCmpSelInstrCost(
487 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
488 }
489
490 // Get cost estimates for the extract elements. These costs will factor into
491 // both sequences.
492 unsigned Ext0Index = Ext0IndexC->getZExtValue();
493 unsigned Ext1Index = Ext1IndexC->getZExtValue();
494
495 InstructionCost Extract0Cost =
496 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
497 InstructionCost Extract1Cost =
498 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
499
500 // A more expensive extract will always be replaced by a splat shuffle.
501 // For example, if Ext0 is more expensive:
502 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
503 // extelt (opcode (splat V0, Ext0), V1), Ext1
504 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
505 // check the cost of creating a broadcast shuffle and shuffling both
506 // operands to element 0.
507 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
508 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
509 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
510
511 // Extra uses of the extracts mean that we include those costs in the
512 // vector total because those instructions will not be eliminated.
513 InstructionCost OldCost, NewCost;
514 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
515 // Handle a special case. If the 2 extracts are identical, adjust the
516 // formulas to account for that. The extra use charge allows for either the
517 // CSE'd pattern or an unoptimized form with identical values:
518 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
519 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
520 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
521 OldCost = CheapExtractCost + ScalarOpCost;
522 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
523 } else {
524 // Handle the general case. Each extract is actually a different value:
525 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
526 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
527 NewCost = VectorOpCost + CheapExtractCost +
528 !Ext0->hasOneUse() * Extract0Cost +
529 !Ext1->hasOneUse() * Extract1Cost;
530 }
531
532 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
533 if (ConvertToShuffle) {
534 if (IsBinOp && DisableBinopExtractShuffle)
535 return true;
536
537 // If we are extracting from 2 different indexes, then one operand must be
538 // shuffled before performing the vector operation. The shuffle mask is
539 // poison except for 1 lane that is being translated to the remaining
540 // extraction lane. Therefore, it is a splat shuffle. Ex:
541 // ShufMask = { poison, poison, 0, poison }
542 // TODO: The cost model has an option for a "broadcast" shuffle
543 // (splat-from-element-0), but no option for a more general splat.
544 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
545      SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
546                                   PoisonMaskElem);
547      ShuffleMask[BestInsIndex] = BestExtIndex;
548      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
549                                    VecTy, VecTy, ShuffleMask, CostKind, 0,
550 nullptr, {ConvertToShuffle});
551 } else {
552      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
553                                    VecTy, VecTy, {}, CostKind, 0, nullptr,
554 {ConvertToShuffle});
555 }
556 }
557
558 // Aggressively form a vector op if the cost is equal because the transform
559 // may enable further optimization.
560 // Codegen can reverse this transform (scalarize) if it was not profitable.
561 return OldCost < NewCost;
562}
563
564/// Create a shuffle that translates (shifts) 1 element from the input vector
565/// to a new element location.
566static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
567 unsigned NewIndex, IRBuilderBase &Builder) {
568 // The shuffle mask is poison except for 1 lane that is being translated
569 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
570 // ShufMask = { 2, poison, poison, poison }
571 auto *VecTy = cast<FixedVectorType>(Vec->getType());
572 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
573 ShufMask[NewIndex] = OldIndex;
574 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
575}
576
577/// Given an extract element instruction with constant index operand, shuffle
578/// the source vector (shift the scalar element) to a NewIndex for extraction.
579/// Return null if the input can be constant folded, so that we are not creating
580/// unnecessary instructions.
581static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
582 IRBuilderBase &Builder) {
583 // Shufflevectors can only be created for fixed-width vectors.
584 Value *X = ExtElt->getVectorOperand();
585 if (!isa<FixedVectorType>(X->getType()))
586 return nullptr;
587
588 // If the extract can be constant-folded, this code is unsimplified. Defer
589 // to other passes to handle that.
590 Value *C = ExtElt->getIndexOperand();
591 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
592 if (isa<Constant>(X))
593 return nullptr;
594
595 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
596 NewIndex, Builder);
597 return Shuf;
598}
599
600/// Try to reduce extract element costs by converting scalar compares to vector
601/// compares followed by extract.
602/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
603Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
604 Instruction &I) {
605 assert(isa<CmpInst>(&I) && "Expected a compare");
606
607 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
608 // --> extelt (cmp Pred V0, V1), ExtIndex
609 ++NumVecCmp;
610 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
611 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
612 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
613}
614
615/// Try to reduce extract element costs by converting scalar binops to vector
616/// binops followed by extract.
617/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
618Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
619 Instruction &I) {
620 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
621
622 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
623 // --> extelt (bo V0, V1), ExtIndex
624 ++NumVecBO;
625 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
626 V1, "foldExtExtBinop");
627
628 // All IR flags are safe to back-propagate because any potential poison
629 // created in unused vector elements is discarded by the extract.
630 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
631 VecBOInst->copyIRFlags(&I);
632
633 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
634}
635
636/// Match an instruction with extracted vector operands.
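// Illustrative IR example (not from the original source; names and types are
// assumed, and the result is shown for the case where the cost model chooses
// to shuffle the first extract to the other extract's index):
//   %a = extractelement <4 x i32> %x, i64 0
//   %b = extractelement <4 x i32> %y, i64 2
//   %r = add i32 %a, %b
// may become:
//   %xs = shufflevector <4 x i32> %x, <4 x i32> poison,
//         <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>
//   %v  = add <4 x i32> %xs, %y
//   %r  = extractelement <4 x i32> %v, i64 2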
637bool VectorCombine::foldExtractExtract(Instruction &I) {
638 // It is not safe to transform things like div, urem, etc. because we may
639 // create undefined behavior when executing those on unknown vector elements.
641 return false;
642
643 Instruction *I0, *I1;
644 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
645  if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
646      !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
647 return false;
648
649 Value *V0, *V1;
650 uint64_t C0, C1;
651 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
652 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
653 V0->getType() != V1->getType())
654 return false;
655
656 // If the scalar value 'I' is going to be re-inserted into a vector, then try
657 // to create an extract to that same element. The extract/insert can be
658 // reduced to a "select shuffle".
659 // TODO: If we add a larger pattern match that starts from an insert, this
660 // probably becomes unnecessary.
661 auto *Ext0 = cast<ExtractElementInst>(I0);
662 auto *Ext1 = cast<ExtractElementInst>(I1);
663 uint64_t InsertIndex = InvalidIndex;
664 if (I.hasOneUse())
665 match(I.user_back(),
666 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
667
668 ExtractElementInst *ExtractToChange;
669 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
670 return false;
671
672 Value *ExtOp0 = Ext0->getVectorOperand();
673 Value *ExtOp1 = Ext1->getVectorOperand();
674
675 if (ExtractToChange) {
676 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
677 Value *NewExtOp =
678 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
679 if (!NewExtOp)
680 return false;
681 if (ExtractToChange == Ext0)
682 ExtOp0 = NewExtOp;
683 else
684 ExtOp1 = NewExtOp;
685 }
686
687 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
688 : Ext0->getIndexOperand();
689 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
690 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
691 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
692 Worklist.push(Ext0);
693 Worklist.push(Ext1);
694 replaceValue(I, *NewExt);
695 return true;
696}
697
698/// Try to replace an extract + scalar fneg + insert with a vector fneg +
699/// shuffle.
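// Illustrative IR example (not from the original source; names and types are
// assumed, with matching source and destination vector types):
//   %e = extractelement <4 x float> %src, i64 1
//   %n = fneg float %e
//   %r = insertelement <4 x float> %dst, float %n, i64 1
// may become:
//   %vn = fneg <4 x float> %src
//   %r  = shufflevector <4 x float> %dst, <4 x float> %vn,
//         <4 x i32> <i32 0, i32 5, i32 2, i32 3>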
700bool VectorCombine::foldInsExtFNeg(Instruction &I) {
701 // Match an insert (op (extract)) pattern.
702 Value *DstVec;
703 uint64_t ExtIdx, InsIdx;
704 Instruction *FNeg;
705 if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
706 m_ConstantInt(InsIdx))))
707 return false;
708
709 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
710 Value *SrcVec;
711 Instruction *Extract;
712 if (!match(FNeg, m_FNeg(m_CombineAnd(
713 m_Instruction(Extract),
714 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
715 return false;
716
717 auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
718 auto *DstVecScalarTy = DstVecTy->getScalarType();
719 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
720 if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
721 return false;
722
723 // Ignore if insert/extract index is out of bounds or destination vector has
724 // one element
725 unsigned NumDstElts = DstVecTy->getNumElements();
726 unsigned NumSrcElts = SrcVecTy->getNumElements();
727 if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
728 return false;
729
730 // We are inserting the negated element into the same lane that we extracted
731 // from. This is equivalent to a select-shuffle that chooses all but the
732 // negated element from the destination vector.
733 SmallVector<int> Mask(NumDstElts);
734 std::iota(Mask.begin(), Mask.end(), 0);
735 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
736 InstructionCost OldCost =
737 TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
738 TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
739
740 // If the extract has one use, it will be eliminated, so count it in the
741 // original cost. If it has more than one use, ignore the cost because it will
742 // be the same before/after.
743 if (Extract->hasOneUse())
744 OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
745
746 InstructionCost NewCost =
747 TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
748      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, DstVecTy,
749                         DstVecTy, Mask, CostKind);
750
751 bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
752 // If the lengths of the two vectors are not equal,
753 // we need to add a length-change vector. Add this cost.
754 SmallVector<int> SrcMask;
755 if (NeedLenChg) {
756 SrcMask.assign(NumDstElts, PoisonMaskElem);
757 SrcMask[ExtIdx % NumDstElts] = ExtIdx;
758    NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
759                                  DstVecTy, SrcVecTy, SrcMask, CostKind);
760 }
761
762 LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
763 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
764 << "\n");
765 if (NewCost > OldCost)
766 return false;
767
768 Value *NewShuf, *LenChgShuf = nullptr;
769 // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
770 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
771 if (NeedLenChg) {
772 // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
773 LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
774 NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
775 Worklist.pushValue(LenChgShuf);
776 } else {
777 // shuffle DstVec, (fneg SrcVec), Mask
778 NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
779 }
780
781 Worklist.pushValue(VecFNeg);
782 replaceValue(I, *NewShuf);
783 return true;
784}
785
786/// Try to fold insert(binop(x,y),binop(a,b),idx)
787/// --> binop(insert(x,a,idx),insert(y,b,idx))
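// Illustrative IR example (not from the original source; names and types are
// assumed):
//   %vb = add <4 x i32> %x, %y
//   %sb = add i32 %a, %b
//   %r  = insertelement <4 x i32> %vb, i32 %sb, i64 2
// may become:
//   %i0 = insertelement <4 x i32> %x, i32 %a, i64 2
//   %i1 = insertelement <4 x i32> %y, i32 %b, i64 2
//   %r  = add <4 x i32> %i0, %i1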
788bool VectorCombine::foldInsExtBinop(Instruction &I) {
789 BinaryOperator *VecBinOp, *SclBinOp;
790 uint64_t Index;
791 if (!match(&I,
792 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
793 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
794 return false;
795
796 // TODO: Add support for addlike etc.
797 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
798 if (BinOpcode != SclBinOp->getOpcode())
799 return false;
800
801 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
802 if (!ResultTy)
803 return false;
804
805 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
806 // shuffle?
807
808  InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
809                            TTI.getInstructionCost(VecBinOp, CostKind) +
810                            TTI.getInstructionCost(SclBinOp, CostKind);
811 InstructionCost NewCost =
812 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
813 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
814 Index, VecBinOp->getOperand(0),
815 SclBinOp->getOperand(0)) +
816 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
817 Index, VecBinOp->getOperand(1),
818 SclBinOp->getOperand(1));
819
820 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
821 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
822 << "\n");
823 if (NewCost > OldCost)
824 return false;
825
826 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
827 SclBinOp->getOperand(0), Index);
828 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
829 SclBinOp->getOperand(1), Index);
830 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
831
832 // Intersect flags from the old binops.
833 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
834 NewInst->copyIRFlags(VecBinOp);
835 NewInst->andIRFlags(SclBinOp);
836 }
837
838 Worklist.pushValue(NewIns0);
839 Worklist.pushValue(NewIns1);
840 replaceValue(I, *NewBO);
841 return true;
842}
843
844/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
845/// Supports: bitcast, trunc, sext, zext
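// Illustrative IR example (not from the original source; names and types are
// assumed):
//   %zx = zext <4 x i16> %x to <4 x i32>
//   %zy = zext <4 x i16> %y to <4 x i32>
//   %r  = and <4 x i32> %zx, %zy
// may become:
//   %a = and <4 x i16> %x, %y
//   %r = zext <4 x i16> %a to <4 x i32>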
846bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
847 // Check if this is a bitwise logic operation
848 auto *BinOp = dyn_cast<BinaryOperator>(&I);
849 if (!BinOp || !BinOp->isBitwiseLogicOp())
850 return false;
851
852 // Get the cast instructions
853 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
854 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
855 if (!LHSCast || !RHSCast) {
856 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
857 return false;
858 }
859
860 // Both casts must be the same type
861 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
862 if (CastOpcode != RHSCast->getOpcode())
863 return false;
864
865 // Only handle supported cast operations
866 switch (CastOpcode) {
867 case Instruction::BitCast:
868 case Instruction::Trunc:
869 case Instruction::SExt:
870 case Instruction::ZExt:
871 break;
872 default:
873 return false;
874 }
875
876 Value *LHSSrc = LHSCast->getOperand(0);
877 Value *RHSSrc = RHSCast->getOperand(0);
878
879 // Source types must match
880 if (LHSSrc->getType() != RHSSrc->getType())
881 return false;
882
883 auto *SrcTy = LHSSrc->getType();
884 auto *DstTy = I.getType();
885 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
886 // Other casts only handle vector types with integer elements.
887 if (CastOpcode != Instruction::BitCast &&
888 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
889 return false;
890
891 // Only integer scalar/vector values are legal for bitwise logic operations.
892 if (!SrcTy->getScalarType()->isIntegerTy() ||
893 !DstTy->getScalarType()->isIntegerTy())
894 return false;
895
896 // Cost Check :
897 // OldCost = bitlogic + 2*casts
898 // NewCost = bitlogic + cast
899
900  // Calculate specific costs for each cast with instruction context
901  InstructionCost LHSCastCost = TTI.getCastInstrCost(
902      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
903  InstructionCost RHSCastCost = TTI.getCastInstrCost(
904      CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
905
906 InstructionCost OldCost =
907 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
908 LHSCastCost + RHSCastCost;
909
910 // For new cost, we can't provide an instruction (it doesn't exist yet)
911 InstructionCost GenericCastCost = TTI.getCastInstrCost(
912 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
913
914 InstructionCost NewCost =
915 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
916 GenericCastCost;
917
918 // Account for multi-use casts using specific costs
919 if (!LHSCast->hasOneUse())
920 NewCost += LHSCastCost;
921 if (!RHSCast->hasOneUse())
922 NewCost += RHSCastCost;
923
924 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
925 << " NewCost=" << NewCost << "\n");
926
927 if (NewCost > OldCost)
928 return false;
929
930 // Create the operation on the source type
931 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
932 BinOp->getName() + ".inner");
933 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
934 NewBinOp->copyIRFlags(BinOp);
935
936 Worklist.pushValue(NewOp);
937
938 // Create the cast operation directly to ensure we get a new instruction
939 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
940
941 // Preserve cast instruction flags
942 NewCast->copyIRFlags(LHSCast);
943 NewCast->andIRFlags(RHSCast);
944
945 // Insert the new instruction
946 Value *Result = Builder.Insert(NewCast);
947
948 replaceValue(I, *Result);
949 return true;
950}
951
952/// Match:
953// bitop(castop(x), C) ->
954// bitop(castop(x), castop(InvC)) ->
955// castop(bitop(x, InvC))
956// Supports: bitcast
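// Illustrative IR example (not from the original source; names, types, and a
// little-endian layout are assumed):
//   %b = bitcast <2 x i32> %x to <4 x i16>
//   %r = and <4 x i16> %b, <i16 -1, i16 0, i16 -1, i16 0>
// may become:
//   %a = and <2 x i32> %x, <i32 65535, i32 65535>
//   %r = bitcast <2 x i32> %a to <4 x i16>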
957bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
958  Value *LHS;
959  Constant *C;
960
961  // Check if this is a bitwise logic operation
962  if (!match(&I, m_c_BitwiseLogic(m_Value(LHS), m_Constant(C))))
963    return false;
964
965 // Get the cast instructions
966 auto *LHSCast = dyn_cast<CastInst>(LHS);
967 if (!LHSCast)
968 return false;
969
970 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
971
972 // Only handle supported cast operations
973 switch (CastOpcode) {
974 case Instruction::BitCast:
975 case Instruction::ZExt:
976 case Instruction::SExt:
977 case Instruction::Trunc:
978 break;
979 default:
980 return false;
981 }
982
983 Value *LHSSrc = LHSCast->getOperand(0);
984
985 auto *SrcTy = LHSSrc->getType();
986 auto *DstTy = I.getType();
987 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
988 // Other casts only handle vector types with integer elements.
989 if (CastOpcode != Instruction::BitCast &&
990 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
991 return false;
992
993 // Only integer scalar/vector values are legal for bitwise logic operations.
994 if (!SrcTy->getScalarType()->isIntegerTy() ||
995 !DstTy->getScalarType()->isIntegerTy())
996 return false;
997
998 // Find the constant InvC, such that castop(InvC) equals to C.
999 PreservedCastFlags RHSFlags;
1000 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
1001 if (!InvC)
1002 return false;
1003
1004 // Cost Check :
1005 // OldCost = bitlogic + cast
1006 // NewCost = bitlogic + cast
1007
1008 // Calculate specific costs for each cast with instruction context
1009 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1010 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1011
1012 InstructionCost OldCost =
1013 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1014
1015 // For new cost, we can't provide an instruction (it doesn't exist yet)
1016 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1017 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1018
1019 InstructionCost NewCost =
1020 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1021 GenericCastCost;
1022
1023 // Account for multi-use casts using specific costs
1024 if (!LHSCast->hasOneUse())
1025 NewCost += LHSCastCost;
1026
1027 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1028 << " NewCost=" << NewCost << "\n");
1029
1030 if (NewCost > OldCost)
1031 return false;
1032
1033 // Create the operation on the source type
1034 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1035 LHSSrc, InvC, I.getName() + ".inner");
1036 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1037 NewBinOp->copyIRFlags(&I);
1038
1039 Worklist.pushValue(NewOp);
1040
1041 // Create the cast operation directly to ensure we get a new instruction
1042 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1043
1044 // Preserve cast instruction flags
1045 if (RHSFlags.NNeg)
1046 NewCast->setNonNeg();
1047 if (RHSFlags.NUW)
1048 NewCast->setHasNoUnsignedWrap();
1049 if (RHSFlags.NSW)
1050 NewCast->setHasNoSignedWrap();
1051
1052 NewCast->andIRFlags(LHSCast);
1053
1054 // Insert the new instruction
1055 Value *Result = Builder.Insert(NewCast);
1056
1057 replaceValue(I, *Result);
1058 return true;
1059}
1060
1061/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1062/// destination type followed by shuffle. This can enable further transforms by
1063/// moving bitcasts or shuffles together.
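// Illustrative IR example (not from the original source; names and types are
// assumed):
//   %s = shufflevector <4 x i32> %v, <4 x i32> poison,
//        <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %r = bitcast <4 x i32> %s to <8 x i16>
// may become (the mask is narrowed by the element-size ratio of 2):
//   %c = bitcast <4 x i32> %v to <8 x i16>
//   %r = shufflevector <8 x i16> %c, <8 x i16> poison,
//        <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>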
1064bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1065 Value *V0, *V1;
1066 ArrayRef<int> Mask;
1067 if (!match(&I, m_BitCast(m_OneUse(
1068 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1069 return false;
1070
1071 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1072 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1073 // mask for scalable type is a splat or not.
1074 // 2) Disallow non-vector casts.
1075 // TODO: We could allow any shuffle.
1076 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1077 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1078 if (!DestTy || !SrcTy)
1079 return false;
1080
1081 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1082 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1083 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1084 return false;
1085
1086 bool IsUnary = isa<UndefValue>(V1);
1087
1088 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1089 // if it won't increase the number of bitcasts.
1090  if (!IsUnary) {
1091    auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
1092    auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
1093    if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1094 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1095 return false;
1096 }
1097
1098 SmallVector<int, 16> NewMask;
1099 if (DestEltSize <= SrcEltSize) {
1100 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1101 // always be expanded to the equivalent form choosing narrower elements.
1102 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
1103 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1104 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1105 } else {
1106 // The bitcast is from narrow elements to wide elements. The shuffle mask
1107 // must choose consecutive elements to allow casting first.
1108 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
1109 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1110 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1111 return false;
1112 }
1113
1114 // Bitcast the shuffle src - keep its original width but using the destination
1115 // scalar type.
1116 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1117 auto *NewShuffleTy =
1118 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1119 auto *OldShuffleTy =
1120 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1121 unsigned NumOps = IsUnary ? 1 : 2;
1122
1123  // The new shuffle must not cost more than the old shuffle.
1124  TargetTransformInfo::ShuffleKind SK =
1125      IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
1126              : TargetTransformInfo::SK_PermuteTwoSrc;
1127
1128 InstructionCost NewCost =
1129 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1130 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1131 TargetTransformInfo::CastContextHint::None,
1132 CostKind));
1133 InstructionCost OldCost =
1134 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1135 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1136 TargetTransformInfo::CastContextHint::None,
1137 CostKind);
1138
1139 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1140 << OldCost << " vs NewCost: " << NewCost << "\n");
1141
1142 if (NewCost > OldCost || !NewCost.isValid())
1143 return false;
1144
1145 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1146 ++NumShufOfBitcast;
1147 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1148 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1149 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1150 replaceValue(I, *Shuf);
1151 return true;
1152}
1153
1154/// VP Intrinsics whose vector operands are both splat values may be simplified
1155/// into the scalar version of the operation and the result splatted. This
1156/// can lead to scalarization down the line.
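// Illustrative IR example (not from the original source; names are assumed,
// and %xsplat/%ysplat stand for any insertelement+shufflevector splat of
// %x/%y):
//   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %xsplat, <4 x i32> %ysplat,
//                                          <4 x i1> splat (i1 true), i32 %evl)
// may become:
//   %s = add i32 %x, %y
//   followed by a splat of %s that replaces all uses of %r.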
1157bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1158 if (!isa<VPIntrinsic>(I))
1159 return false;
1160 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1161 Value *Op0 = VPI.getArgOperand(0);
1162 Value *Op1 = VPI.getArgOperand(1);
1163
1164 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1165 return false;
1166
1167 // Check getSplatValue early in this function, to avoid doing unnecessary
1168 // work.
1169 Value *ScalarOp0 = getSplatValue(Op0);
1170 Value *ScalarOp1 = getSplatValue(Op1);
1171 if (!ScalarOp0 || !ScalarOp1)
1172 return false;
1173
1174 // For the binary VP intrinsics supported here, the result on disabled lanes
1175 // is a poison value. For now, only do this simplification if all lanes
1176 // are active.
1177 // TODO: Relax the condition that all lanes are active by using insertelement
1178 // on inactive lanes.
1179 auto IsAllTrueMask = [](Value *MaskVal) {
1180 if (Value *SplattedVal = getSplatValue(MaskVal))
1181 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1182 return ConstValue->isAllOnesValue();
1183 return false;
1184 };
1185 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1186 return false;
1187
1188 // Check to make sure we support scalarization of the intrinsic
1189 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1190 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1191 return false;
1192
1193 // Calculate cost of splatting both operands into vectors and the vector
1194 // intrinsic
1195 VectorType *VecTy = cast<VectorType>(VPI.getType());
1196 SmallVector<int> Mask;
1197 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1198 Mask.resize(FVTy->getNumElements(), 0);
1199 InstructionCost SplatCost =
1200 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
1201      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, Mask,
1202                         CostKind);
1203
1204 // Calculate the cost of the VP Intrinsic
1205  SmallVector<Type *> Args;
1206  for (Value *V : VPI.args())
1207 Args.push_back(V->getType());
1208 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1209 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1210 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1211
1212 // Determine scalar opcode
1213 std::optional<unsigned> FunctionalOpcode =
1214 VPI.getFunctionalOpcode();
1215 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1216 if (!FunctionalOpcode) {
1217 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1218 if (!ScalarIntrID)
1219 return false;
1220 }
1221
1222 // Calculate cost of scalarizing
1223 InstructionCost ScalarOpCost = 0;
1224 if (ScalarIntrID) {
1225 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1226 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1227 } else {
1228 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1229 VecTy->getScalarType(), CostKind);
1230 }
1231
1232 // The existing splats may be kept around if other instructions use them.
1233 InstructionCost CostToKeepSplats =
1234 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1235 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1236
1237 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1238 << "\n");
1239 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1240 << ", Cost of scalarizing:" << NewCost << "\n");
1241
1242 // We want to scalarize unless the vector variant actually has lower cost.
1243 if (OldCost < NewCost || !NewCost.isValid())
1244 return false;
1245
1246 // Scalarize the intrinsic
1247 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1248 Value *EVL = VPI.getArgOperand(3);
1249
1250 // If the VP op might introduce UB or poison, we can scalarize it provided
1251 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1252 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1253 // scalarizing it.
1254 bool SafeToSpeculate;
1255 if (ScalarIntrID)
1256 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1257 .hasAttribute(Attribute::AttrKind::Speculatable);
1258 else
1259    SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
1260        *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
1261 if (!SafeToSpeculate &&
1262 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
1263 return false;
1264
1265 Value *ScalarVal =
1266 ScalarIntrID
1267 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1268 {ScalarOp0, ScalarOp1})
1269 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1270 ScalarOp0, ScalarOp1);
1271
1272 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1273 return true;
1274}
1275
1276/// Match a vector op/compare/intrinsic with at least one
1277/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1278/// by insertelement.
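// Illustrative IR example (not from the original source; names and types are
// assumed):
//   %v = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %x, i64 0
//   %r = add <4 x i32> %v, splat (i32 7)
// may become:
//   %s = add i32 %x, 7
//   %r = insertelement <4 x i32> <i32 7, i32 8, i32 9, i32 10>, i32 %s, i64 0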
1279bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1280 auto *UO = dyn_cast<UnaryOperator>(&I);
1281 auto *BO = dyn_cast<BinaryOperator>(&I);
1282 auto *CI = dyn_cast<CmpInst>(&I);
1283 auto *II = dyn_cast<IntrinsicInst>(&I);
1284 if (!UO && !BO && !CI && !II)
1285 return false;
1286
1287 // TODO: Allow intrinsics with different argument types
1288 if (II) {
1289 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1290 return false;
1291 for (auto [Idx, Arg] : enumerate(II->args()))
1292 if (Arg->getType() != II->getType() &&
1293 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1294 return false;
1295 }
1296
1297 // Do not convert the vector condition of a vector select into a scalar
1298 // condition. That may cause problems for codegen because of differences in
1299 // boolean formats and register-file transfers.
1300 // TODO: Can we account for that in the cost model?
1301 if (CI)
1302 for (User *U : I.users())
1303 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1304 return false;
1305
1306 // Match constant vectors or scalars being inserted into constant vectors:
1307 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1308 SmallVector<Value *> VecCs, ScalarOps;
1309 std::optional<uint64_t> Index;
1310
1311 auto Ops = II ? II->args() : I.operands();
1312 for (auto [OpNum, Op] : enumerate(Ops)) {
1313 Constant *VecC;
1314 Value *V;
1315 uint64_t InsIdx = 0;
1316 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1317 m_ConstantInt(InsIdx)))) {
1318 // Bail if any inserts are out of bounds.
1319 VectorType *OpTy = cast<VectorType>(Op->getType());
1320 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1321 return false;
1322 // All inserts must have the same index.
1323 // TODO: Deal with mismatched index constants and variable indexes?
1324 if (!Index)
1325 Index = InsIdx;
1326 else if (InsIdx != *Index)
1327 return false;
1328 VecCs.push_back(VecC);
1329 ScalarOps.push_back(V);
1330 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1331 OpNum, &TTI)) {
1332 VecCs.push_back(Op.get());
1333 ScalarOps.push_back(Op.get());
1334 } else if (match(Op.get(), m_Constant(VecC))) {
1335 VecCs.push_back(VecC);
1336 ScalarOps.push_back(nullptr);
1337 } else {
1338 return false;
1339 }
1340 }
1341
1342 // Bail if all operands are constant.
1343 if (!Index.has_value())
1344 return false;
1345
1346 VectorType *VecTy = cast<VectorType>(I.getType());
1347 Type *ScalarTy = VecTy->getScalarType();
1348 assert(VecTy->isVectorTy() &&
1349 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1350 ScalarTy->isPointerTy()) &&
1351 "Unexpected types for insert element into binop or cmp");
1352
1353 unsigned Opcode = I.getOpcode();
1354 InstructionCost ScalarOpCost, VectorOpCost;
1355 if (CI) {
1356 CmpInst::Predicate Pred = CI->getPredicate();
1357 ScalarOpCost = TTI.getCmpSelInstrCost(
1358 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1359 VectorOpCost = TTI.getCmpSelInstrCost(
1360 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1361 } else if (UO || BO) {
1362 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1363 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1364 } else {
1365 IntrinsicCostAttributes ScalarICA(
1366 II->getIntrinsicID(), ScalarTy,
1367 SmallVector<Type *>(II->arg_size(), ScalarTy));
1368 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1369 IntrinsicCostAttributes VectorICA(
1370 II->getIntrinsicID(), VecTy,
1371 SmallVector<Type *>(II->arg_size(), VecTy));
1372 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1373 }
1374
1375 // Fold the vector constants in the original vectors into a new base vector to
1376 // get more accurate cost modelling.
1377 Value *NewVecC = nullptr;
1378 if (CI)
1379 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1380 else if (UO)
1381 NewVecC =
1382 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1383 else if (BO)
1384 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1385 else if (II)
1386 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1387
1388 if (!NewVecC)
1389 return false;
1390
1391 // Get cost estimate for the insert element. This cost will factor into
1392 // both sequences.
1393 InstructionCost OldCost = VectorOpCost;
1394 InstructionCost NewCost =
1395 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1396 CostKind, *Index, NewVecC);
1397
1398 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1399 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1400 II->getIntrinsicID(), Idx, &TTI)))
1401 continue;
1402    InstructionCost InsertCost = TTI.getVectorInstrCost(
1403        Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1404 OldCost += InsertCost;
1405 NewCost += !Op->hasOneUse() * InsertCost;
1406 }
1407
1408 // We want to scalarize unless the vector variant actually has lower cost.
1409 if (OldCost < NewCost || !NewCost.isValid())
1410 return false;
1411
1412 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1413 // inselt NewVecC, (scalar_op V0, V1), Index
1414 if (CI)
1415 ++NumScalarCmp;
1416 else if (UO || BO)
1417 ++NumScalarOps;
1418 else
1419 ++NumScalarIntrinsic;
1420
1421 // For constant cases, extract the scalar element, this should constant fold.
1422 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1423 if (!Scalar)
1424      ScalarOps[OpIdx] = Builder.CreateExtractElement(
1425          cast<Constant>(VecC), Builder.getInt64(*Index));
1426
1427 Value *Scalar;
1428 if (CI)
1429 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1430 else if (UO || BO)
1431 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1432 else
1433 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1434
1435 Scalar->setName(I.getName() + ".scalar");
1436
1437 // All IR flags are safe to back-propagate. There is no potential for extra
1438 // poison to be created by the scalar instruction.
1439 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1440 ScalarInst->copyIRFlags(&I);
1441
1442 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1443 replaceValue(I, *Insert);
1444 return true;
1445}
1446
1447/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1448/// a vector into vector operations followed by extract. Note: The SLP pass
1449/// may miss this pattern because of implementation problems.
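// Illustrative IR example (not from the original source; names and types are
// assumed, shown for the case where the cost model decides to shuffle the
// second extract):
//   %e0 = extractelement <4 x i32> %x, i64 0
//   %e1 = extractelement <4 x i32> %x, i64 3
//   %c0 = icmp sgt i32 %e0, 42
//   %c1 = icmp sgt i32 %e1, 99
//   %r  = and i1 %c0, %c1
// may become:
//   %vc = icmp sgt <4 x i32> %x, <i32 42, i32 poison, i32 poison, i32 99>
//   %sh = shufflevector <4 x i1> %vc, <4 x i1> poison,
//         <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
//   %vb = and <4 x i1> %vc, %sh
//   %r  = extractelement <4 x i1> %vb, i64 0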
1450bool VectorCombine::foldExtractedCmps(Instruction &I) {
1451 auto *BI = dyn_cast<BinaryOperator>(&I);
1452
1453 // We are looking for a scalar binop of booleans.
1454 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1455 if (!BI || !I.getType()->isIntegerTy(1))
1456 return false;
1457
1458 // The compare predicates should match, and each compare should have a
1459 // constant operand.
1460 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1461 Instruction *I0, *I1;
1462 Constant *C0, *C1;
1463 CmpPredicate P0, P1;
1464 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1465 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1466 return false;
1467
1468 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1469 if (!MatchingPred)
1470 return false;
1471
1472 // The compare operands must be extracts of the same vector with constant
1473 // extract indexes.
1474 Value *X;
1475 uint64_t Index0, Index1;
1476 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1477 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1478 return false;
1479
1480 auto *Ext0 = cast<ExtractElementInst>(I0);
1481 auto *Ext1 = cast<ExtractElementInst>(I1);
1482 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1483 if (!ConvertToShuf)
1484 return false;
1485 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1486 "Unknown ExtractElementInst");
1487
1488 // The original scalar pattern is:
1489 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1490 CmpInst::Predicate Pred = *MatchingPred;
1491 unsigned CmpOpcode =
1492 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1493 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1494 if (!VecTy)
1495 return false;
1496
1497 InstructionCost Ext0Cost =
1498 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1499 InstructionCost Ext1Cost =
1500 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
1501 InstructionCost CmpCost = TTI.getCmpSelInstrCost(
1502 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1503 CostKind);
1504
1505 InstructionCost OldCost =
1506 Ext0Cost + Ext1Cost + CmpCost * 2 +
1507 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1508
1509 // The proposed vector pattern is:
1510 // vcmp = cmp Pred X, VecC
1511 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1512 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1513 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1514 auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecTy));
1515 InstructionCost NewCost = TTI.getCmpSelInstrCost(
1516 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1517 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1518 ShufMask[CheapIndex] = ExpensiveIndex;
1519 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
1520 CmpTy, ShufMask, CostKind);
1521 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1522 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1523 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1524 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1525
1526 // Aggressively form vector ops if the cost is equal because the transform
1527 // may enable further optimization.
1528 // Codegen can reverse this transform (scalarize) if it was not profitable.
1529 if (OldCost < NewCost || !NewCost.isValid())
1530 return false;
1531
1532 // Create a vector constant from the 2 scalar constants.
1533 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1534 PoisonValue::get(VecTy->getElementType()));
1535 CmpC[Index0] = C0;
1536 CmpC[Index1] = C1;
1537 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1538 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1539 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1540 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1541 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1542 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1543 replaceValue(I, *NewExt);
1544 ++NumVecCmpBO;
1545 return true;
1546}
1547
1548 static void analyzeCostOfVecReduction(const IntrinsicInst &II,
1549 TTI::TargetCostKind CostKind,
1550 const TargetTransformInfo &TTI,
1551 InstructionCost &CostBeforeReduction,
1552 InstructionCost &CostAfterReduction) {
1553 Instruction *Op0, *Op1;
1554 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1555 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1556 unsigned ReductionOpc =
1557 getArithmeticReductionInstruction(II.getIntrinsicID());
1558 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1559 bool IsUnsigned = isa<ZExtInst>(RedOp);
1560 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1561
1562 CostBeforeReduction =
1563 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1564 TTI::CastContextHint::None, CostKind, RedOp);
1565 CostAfterReduction =
1566 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1567 ExtType, FastMathFlags(), CostKind);
1568 return;
1569 }
1570 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1571 match(RedOp,
1572 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
1573 match(Op0, m_ZExtOrSExt(m_Value())) &&
1574 Op0->getOpcode() == Op1->getOpcode() &&
1575 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1576 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1577 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1578 bool IsUnsigned = isa<ZExtInst>(Op0);
1579 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1580 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1581
1582 InstructionCost ExtCost =
1583 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1584 TTI::CastContextHint::None, CostKind, Op0);
1585 InstructionCost MulCost =
1586 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1587 InstructionCost Ext2Cost =
1588 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1589 TTI::CastContextHint::None, CostKind, RedOp);
1590
1591 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1592 CostAfterReduction = TTI.getMulAccReductionCost(
1593 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1594 return;
1595 }
1596 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1597 std::nullopt, CostKind);
1598}
1599
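// Illustrative sketch (editorial note, not upstream code) of the reduction
// merge performed below, assuming the cost model agrees:
//   %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
//   %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
//   %s  = add i32 %r0, %r1
// -->
//   %v  = add <4 x i32> %a, %b
//   %s  = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)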
1600bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1601 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1602 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1603 if (BinOpOpc == Instruction::Sub)
1604 ReductionIID = Intrinsic::vector_reduce_add;
1605 if (ReductionIID == Intrinsic::not_intrinsic)
1606 return false;
1607
1608 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1609 Intrinsic::ID IID) -> Value * {
1610 auto *II = dyn_cast<IntrinsicInst>(V);
1611 if (!II)
1612 return nullptr;
1613 if (II->getIntrinsicID() == IID && II->hasOneUse())
1614 return II->getArgOperand(0);
1615 return nullptr;
1616 };
1617
1618 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1619 if (!V0)
1620 return false;
1621 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1622 if (!V1)
1623 return false;
1624
1625 auto *VTy = cast<VectorType>(V0->getType());
1626 if (V1->getType() != VTy)
1627 return false;
1628 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1629 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1630 unsigned ReductionOpc =
1631 getArithmeticReductionInstruction(II0.getIntrinsicID());
1632
1633 InstructionCost OldCost = 0;
1634 InstructionCost NewCost = 0;
1635 InstructionCost CostOfRedOperand0 = 0;
1636 InstructionCost CostOfRed0 = 0;
1637 InstructionCost CostOfRedOperand1 = 0;
1638 InstructionCost CostOfRed1 = 0;
1639 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1640 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1641 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1642 NewCost =
1643 CostOfRedOperand0 + CostOfRedOperand1 +
1644 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1645 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1646 if (NewCost >= OldCost || !NewCost.isValid())
1647 return false;
1648
1649 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1650 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1651 << "\n");
1652 Value *VectorBO;
1653 if (BinOpOpc == Instruction::Or)
1654 VectorBO = Builder.CreateOr(V0, V1, "",
1655 cast<PossiblyDisjointInst>(I).isDisjoint());
1656 else
1657 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1658
1659 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1660 replaceValue(I, *Rdx);
1661 return true;
1662}
1663
1664// Check if memory loc modified between two instrs in the same BB
1665 static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1666 BasicBlock::iterator End,
1667 const MemoryLocation &Loc, AAResults &AA) {
1668 unsigned NumScanned = 0;
1669 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1670 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1671 ++NumScanned > MaxInstrsToScan;
1672 });
1673}
1674
1675namespace {
1676/// Helper class to indicate whether a vector index can be safely scalarized and
1677/// if a freeze needs to be inserted.
1678class ScalarizationResult {
1679 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1680
1681 StatusTy Status;
1682 Value *ToFreeze;
1683
1684 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1685 : Status(Status), ToFreeze(ToFreeze) {}
1686
1687public:
1688 ScalarizationResult(const ScalarizationResult &Other) = default;
1689 ~ScalarizationResult() {
1690 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1691 }
1692
1693 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1694 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1695 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1696 return {StatusTy::SafeWithFreeze, ToFreeze};
1697 }
1698
1699 /// Returns true if the index can be scalarized without requiring a freeze.
1700 bool isSafe() const { return Status == StatusTy::Safe; }
1701 /// Returns true if the index cannot be scalarized.
1702 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1703 /// Returns true if the index can be scalarized, but requires inserting a
1704 /// freeze.
1705 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1706
1707 /// Reset the state to Unsafe and clear ToFreeze if set.
1708 void discard() {
1709 ToFreeze = nullptr;
1710 Status = StatusTy::Unsafe;
1711 }
1712
1713 /// Freeze ToFreeze and update the use in \p UserI to use the frozen value.
1714 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1715 assert(isSafeWithFreeze() &&
1716 "should only be used when freezing is required");
1717 assert(is_contained(ToFreeze->users(), &UserI) &&
1718 "UserI must be a user of ToFreeze");
1719 IRBuilder<>::InsertPointGuard Guard(Builder);
1720 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1721 Value *Frozen =
1722 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1723 for (Use &U : make_early_inc_range((UserI.operands())))
1724 if (U.get() == ToFreeze)
1725 U.set(Frozen);
1726
1727 ToFreeze = nullptr;
1728 }
1729};
1730} // namespace
1731
1732/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1733/// Idx. \p Idx must access a valid vector element.
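/// Illustrative sketch (editorial note, not upstream code): for a <4 x i32>
/// vector, a constant index 0..3 is safe, while a variable index such as
///   %idx.clamped = and i64 %idx, 3
/// is provably in bounds but may still be poison, so it is reported as
/// "safe with freeze" and %idx is frozen before the access is scalarized.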
1734static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1735 Instruction *CtxI,
1736 AssumptionCache &AC,
1737 const DominatorTree &DT) {
1738 // We do checks for both fixed vector types and scalable vector types.
1739 // This is the number of elements of fixed vector types,
1740 // or the minimum number of elements of scalable vector types.
1741 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1742 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1743
1744 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1745 if (C->getValue().ult(NumElements))
1746 return ScalarizationResult::safe();
1747 return ScalarizationResult::unsafe();
1748 }
1749
1750 // Always unsafe if the index type can't handle all inbound values.
1751 if (!llvm::isUIntN(IntWidth, NumElements))
1752 return ScalarizationResult::unsafe();
1753
1754 APInt Zero(IntWidth, 0);
1755 APInt MaxElts(IntWidth, NumElements);
1756 ConstantRange ValidIndices(Zero, MaxElts);
1757 ConstantRange IdxRange(IntWidth, true);
1758
1759 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1760 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1761 true, &AC, CtxI, &DT)))
1762 return ScalarizationResult::safe();
1763 return ScalarizationResult::unsafe();
1764 }
1765
1766 // If the index may be poison, check if we can insert a freeze before the
1767 // range of the index is restricted.
1768 Value *IdxBase;
1769 ConstantInt *CI;
1770 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1771 IdxRange = IdxRange.binaryAnd(CI->getValue());
1772 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1773 IdxRange = IdxRange.urem(CI->getValue());
1774 }
1775
1776 if (ValidIndices.contains(IdxRange))
1777 return ScalarizationResult::safeWithFreeze(IdxBase);
1778 return ScalarizationResult::unsafe();
1779}
1780
1781/// The memory operation on a vector of \p ScalarType had alignment of
1782/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1783/// alignment that will be valid for the memory operation on a single scalar
1784/// element of the same type with index \p Idx.
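/// Illustrative sketch (editorial note, not upstream code): for a <4 x i32>
/// access with alignment 16, scalarizing index 0 keeps alignment 16, index 1
/// is at byte offset 4 so commonAlignment(16, 4) = 4, and a variable index
/// conservatively falls back to commonAlignment(16, 4) = 4 as well.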
1785 static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1786 Type *ScalarType, Value *Idx,
1787 const DataLayout &DL) {
1788 if (auto *C = dyn_cast<ConstantInt>(Idx))
1789 return commonAlignment(VectorAlignment,
1790 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1791 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1792}
1793
1794// Combine patterns like:
1795// %0 = load <4 x i32>, <4 x i32>* %a
1796// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1797// store <4 x i32> %1, <4 x i32>* %a
1798// to:
1799// %0 = bitcast <4 x i32>* %a to i32*
1800// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1801// store i32 %b, i32* %1
1802bool VectorCombine::foldSingleElementStore(Instruction &I) {
1804 return false;
1805 auto *SI = cast<StoreInst>(&I);
1806 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1807 return false;
1808
1809 // TODO: Combine more complicated patterns (multiple insert) by referencing
1810 // TargetTransformInfo.
1811 Instruction *Source;
1812 Value *NewElement;
1813 Value *Idx;
1814 if (!match(SI->getValueOperand(),
1815 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1816 m_Value(Idx))))
1817 return false;
1818
1819 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1820 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1821 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1822 // Don't optimize for atomic/volatile load or store. Ensure memory is not
1823 // modified between, vector type matches store size, and index is inbounds.
1824 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1825 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1826 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1827 return false;
1828
1829 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1830 if (ScalarizableIdx.isUnsafe() ||
1831 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1832 MemoryLocation::get(SI), AA))
1833 return false;
1834
1835 // Ensure we add the load back to the worklist BEFORE its users so they can
1836 // be erased in the correct order.
1837 Worklist.push(Load);
1838
1839 if (ScalarizableIdx.isSafeWithFreeze())
1840 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1841 Value *GEP = Builder.CreateInBoundsGEP(
1842 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1843 {ConstantInt::get(Idx->getType(), 0), Idx});
1844 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1845 NSI->copyMetadata(*SI);
1846 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1847 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1848 *DL);
1849 NSI->setAlignment(ScalarOpAlignment);
1850 replaceValue(I, *NSI);
1851 eraseInstruction(I);
1852 return true;
1853 }
1854
1855 return false;
1856}
1857
1858/// Try to scalarize vector loads feeding extractelement or bitcast
1859/// instructions.
1860bool VectorCombine::scalarizeLoad(Instruction &I) {
1861 Value *Ptr;
1862 if (!match(&I, m_Load(m_Value(Ptr))))
1863 return false;
1864
1865 auto *LI = cast<LoadInst>(&I);
1866 auto *VecTy = cast<VectorType>(LI->getType());
1867 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
1868 return false;
1869
1870 bool AllExtracts = true;
1871 bool AllBitcasts = true;
1872 Instruction *LastCheckedInst = LI;
1873 unsigned NumInstChecked = 0;
1874
1875 // Check what type of users we have (must either all be extracts or
1876 // bitcasts) and ensure no memory modifications between the load and
1877 // its users.
1878 for (User *U : LI->users()) {
1879 auto *UI = dyn_cast<Instruction>(U);
1880 if (!UI || UI->getParent() != LI->getParent())
1881 return false;
1882
1883 // If any user is waiting to be erased, then bail out as this will
1884 // distort the cost calculation and possibly lead to infinite loops.
1885 if (UI->use_empty())
1886 return false;
1887
1888 if (!isa<ExtractElementInst>(UI))
1889 AllExtracts = false;
1890 if (!isa<BitCastInst>(UI))
1891 AllBitcasts = false;
1892
1893 // Check if any instruction between the load and the user may modify memory.
1894 if (LastCheckedInst->comesBefore(UI)) {
1895 for (Instruction &I :
1896 make_range(std::next(LI->getIterator()), UI->getIterator())) {
1897 // Bail out if we reached the check limit or the instruction may write
1898 // to memory.
1899 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
1900 return false;
1901 NumInstChecked++;
1902 }
1903 LastCheckedInst = UI;
1904 }
1905 }
1906
1907 if (AllExtracts)
1908 return scalarizeLoadExtract(LI, VecTy, Ptr);
1909 if (AllBitcasts)
1910 return scalarizeLoadBitcast(LI, VecTy, Ptr);
1911 return false;
1912}
1913
1914/// Try to scalarize vector loads feeding extractelement instructions.
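/// Illustrative sketch (editorial note, not upstream code), assuming a
/// 16-byte aligned source pointer %p and only extract users:
///   %v  = load <4 x i32>, ptr %p, align 16
///   %e0 = extractelement <4 x i32> %v, i64 0
///   %e2 = extractelement <4 x i32> %v, i64 2
/// -->
///   %p0 = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 0
///   %e0 = load i32, ptr %p0, align 16
///   %p2 = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 2
///   %e2 = load i32, ptr %p2, align 8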
1915bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
1916 Value *Ptr) {
1918 return false;
1919
1920 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
1921 auto FailureGuard = make_scope_exit([&]() {
1922 // If the transform is aborted, discard the ScalarizationResults.
1923 for (auto &Pair : NeedFreeze)
1924 Pair.second.discard();
1925 });
1926
1927 InstructionCost OriginalCost =
1928 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
1929 LI->getPointerAddressSpace(), CostKind);
1930 InstructionCost ScalarizedCost = 0;
1931
1932 for (User *U : LI->users()) {
1933 auto *UI = cast<ExtractElementInst>(U);
1934
1935 auto ScalarIdx =
1936 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
1937 if (ScalarIdx.isUnsafe())
1938 return false;
1939 if (ScalarIdx.isSafeWithFreeze()) {
1940 NeedFreeze.try_emplace(UI, ScalarIdx);
1941 ScalarIdx.discard();
1942 }
1943
1944 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
1945 OriginalCost +=
1946 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
1947 Index ? Index->getZExtValue() : -1);
1948 ScalarizedCost +=
1949 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
1950 Align(1), LI->getPointerAddressSpace(), CostKind);
1951 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
1952 nullptr, nullptr, CostKind);
1953 }
1954
1955 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
1956 << "\n LoadExtractCost: " << OriginalCost
1957 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
1958
1959 if (ScalarizedCost >= OriginalCost)
1960 return false;
1961
1962 // Ensure we add the load back to the worklist BEFORE its users so they can
1963 // be erased in the correct order.
1964 Worklist.push(LI);
1965
1966 Type *ElemType = VecTy->getElementType();
1967
1968 // Replace extracts with narrow scalar loads.
1969 for (User *U : LI->users()) {
1970 auto *EI = cast<ExtractElementInst>(U);
1971 Value *Idx = EI->getIndexOperand();
1972
1973 // Insert 'freeze' for poison indexes.
1974 auto It = NeedFreeze.find(EI);
1975 if (It != NeedFreeze.end())
1976 It->second.freeze(Builder, *cast<Instruction>(Idx));
1977
1978 Builder.SetInsertPoint(EI);
1979 Value *GEP =
1980 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
1981 auto *NewLoad = cast<LoadInst>(
1982 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
1983
1984 Align ScalarOpAlignment =
1985 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
1986 NewLoad->setAlignment(ScalarOpAlignment);
1987
1988 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
1989 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
1990 AAMDNodes OldAAMD = LI->getAAMetadata();
1991 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
1992 }
1993
1994 replaceValue(*EI, *NewLoad, false);
1995 }
1996
1997 FailureGuard.release();
1998 return true;
1999}
2000
2001/// Try to scalarize vector loads feeding bitcast instructions.
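/// Illustrative sketch (editorial note, not upstream code):
///   %v = load <4 x i8>, ptr %p, align 4
///   %i = bitcast <4 x i8> %v to i32
/// -->
///   %i = load i32, ptr %p, align 4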
2002bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
2003 Value *Ptr) {
2004 InstructionCost OriginalCost =
2005 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2006 LI->getPointerAddressSpace(), CostKind);
2007
2008 Type *TargetScalarType = nullptr;
2009 unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
2010
2011 for (User *U : LI->users()) {
2012 auto *BC = cast<BitCastInst>(U);
2013
2014 Type *DestTy = BC->getDestTy();
2015 if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
2016 return false;
2017
2018 unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
2019 if (DestBitWidth != VecBitWidth)
2020 return false;
2021
2022 // All bitcasts must target the same scalar type.
2023 if (!TargetScalarType)
2024 TargetScalarType = DestTy;
2025 else if (TargetScalarType != DestTy)
2026 return false;
2027
2028 OriginalCost +=
2029 TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
2030 TTI::CastContextHint::None, CostKind);
2031 }
2032
2033 if (!TargetScalarType)
2034 return false;
2035
2036 assert(!LI->user_empty() && "Unexpected load without bitcast users");
2037 InstructionCost ScalarizedCost =
2038 TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
2039 LI->getPointerAddressSpace(), CostKind);
2040
2041 LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2042 << "\n OriginalCost: " << OriginalCost
2043 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2044
2045 if (ScalarizedCost >= OriginalCost)
2046 return false;
2047
2048 // Ensure we add the load back to the worklist BEFORE its users so they can
2049 // be erased in the correct order.
2050 Worklist.push(LI);
2051
2052 Builder.SetInsertPoint(LI);
2053 auto *ScalarLoad =
2054 Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
2055 ScalarLoad->setAlignment(LI->getAlign());
2056 ScalarLoad->copyMetadata(*LI);
2057
2058 // Replace all bitcast users with the scalar load.
2059 for (User *U : LI->users()) {
2060 auto *BC = cast<BitCastInst>(U);
2061 replaceValue(*BC, *ScalarLoad, false);
2062 }
2063
2064 return true;
2065}
2066
2067bool VectorCombine::scalarizeExtExtract(Instruction &I) {
2069 return false;
2070 auto *Ext = dyn_cast<ZExtInst>(&I);
2071 if (!Ext)
2072 return false;
2073
2074 // Try to convert a vector zext feeding only extracts to a set of scalar
2075 // (Src >> (ExtIdx * SrcEltBits)) & ((1 << SrcEltBits) - 1)
2076 // operations on a bitcast of Src, if profitable.
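// Illustrative sketch (editorial note, not upstream code), little-endian,
// with a zext from <4 x i8> to <4 x i32> whose uses are all extracts:
//   %ext = zext <4 x i8> %src to <4 x i32>
//   %e1  = extractelement <4 x i32> %ext, i64 1
// -->
//   %s   = bitcast <4 x i8> %src to i32
//   %sh  = lshr i32 %s, 8
//   %e1  = and i32 %sh, 255
// (a freeze of %src may be emitted first if it could be poison).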
2077 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2078 if (!SrcTy)
2079 return false;
2080 auto *DstTy = cast<FixedVectorType>(Ext->getType());
2081
2082 Type *ScalarDstTy = DstTy->getElementType();
2083 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2084 return false;
2085
2086 InstructionCost VectorCost =
2087 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2088 TTI::CastContextHint::None, CostKind, Ext);
2089 unsigned ExtCnt = 0;
2090 bool ExtLane0 = false;
2091 for (User *U : Ext->users()) {
2092 uint64_t Idx;
2093 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2094 return false;
2095 if (cast<Instruction>(U)->use_empty())
2096 continue;
2097 ExtCnt += 1;
2098 ExtLane0 |= !Idx;
2099 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2100 CostKind, Idx, U);
2101 }
2102
2103 InstructionCost ScalarCost =
2104 ExtCnt * TTI.getArithmeticInstrCost(
2105 Instruction::And, ScalarDstTy, CostKind,
2106 {TTI::OK_AnyValue, TTI::OP_None},
2107 {TTI::OK_NonUniformConstantValue, TTI::OP_None}) +
2108 (ExtCnt - ExtLane0) *
2109 TTI.getArithmeticInstrCost(
2110 Instruction::LShr, ScalarDstTy, CostKind,
2111 {TTI::OK_AnyValue, TTI::OP_None},
2112 {TTI::OK_NonUniformConstantValue, TTI::OP_None});
2113 if (ScalarCost > VectorCost)
2114 return false;
2115
2116 Value *ScalarV = Ext->getOperand(0);
2117 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2118 &DT)) {
2119 // Check whether all lanes are extracted, all extracts trigger UB
2120 // on poison, and the last extract (and hence all previous ones)
2121 // are guaranteed to execute if Ext executes. If so, we do not
2122 // need to insert a freeze.
2123 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2124 bool AllExtractsTriggerUB = true;
2125 ExtractElementInst *LastExtract = nullptr;
2126 BasicBlock *ExtBB = Ext->getParent();
2127 for (User *U : Ext->users()) {
2128 auto *Extract = cast<ExtractElementInst>(U);
2129 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2130 AllExtractsTriggerUB = false;
2131 break;
2132 }
2133 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2134 if (!LastExtract || LastExtract->comesBefore(Extract))
2135 LastExtract = Extract;
2136 }
2137 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2138 !AllExtractsTriggerUB ||
2139 !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(),
2140 LastExtract->getIterator()))
2141 ScalarV = Builder.CreateFreeze(ScalarV);
2142 }
2143 ScalarV = Builder.CreateBitCast(
2144 ScalarV,
2145 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2146 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2147 uint64_t EltBitMask = (1ull << SrcEltSizeInBits) - 1;
2148 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2149 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2150 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2151 for (User *U : Ext->users()) {
2152 auto *Extract = cast<ExtractElementInst>(U);
2153 uint64_t Idx =
2154 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2155 uint64_t ShiftAmt =
2156 DL->isBigEndian()
2157 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2158 : (Idx * SrcEltSizeInBits);
2159 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2160 Value *And = Builder.CreateAnd(LShr, Mask);
2161 U->replaceAllUsesWith(And);
2162 }
2163 return true;
2164}
2165
2166/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2167/// to "(bitcast (concat X, Y))"
2168/// where X/Y are bitcasted from i1 mask vectors.
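/// Illustrative sketch (editorial note, not upstream code) with two v8i1
/// masks concatenated into an i16:
///   %bx = bitcast <8 x i1> %m0 to i8
///   %by = bitcast <8 x i1> %m1 to i8
///   %zx = zext i8 %bx to i16
///   %zy = zext i8 %by to i16
///   %sh = shl i16 %zy, 8
///   %r  = or disjoint i16 %zx, %sh
/// -->
///   %cc = shufflevector <8 x i1> %m0, <8 x i1> %m1, <16 x i32> <i32 0, i32 1, ..., i32 15>
///   %r  = bitcast <16 x i1> %cc to i16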
2169bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2170 Type *Ty = I.getType();
2171 if (!Ty->isIntegerTy())
2172 return false;
2173
2174 // TODO: Add big endian test coverage
2175 if (DL->isBigEndian())
2176 return false;
2177
2178 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2179 Instruction *X, *Y;
2180 if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y))))
2181 return false;
2182
2183 // Allow both sources to contain shl, to handle more generic pattern:
2184 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2185 Value *SrcX;
2186 uint64_t ShAmtX = 0;
2187 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2188 !match(X, m_OneUse(
2189 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
2190 m_ConstantInt(ShAmtX)))))
2191 return false;
2192
2193 Value *SrcY;
2194 uint64_t ShAmtY = 0;
2195 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2196 !match(Y, m_OneUse(
2197 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
2198 m_ConstantInt(ShAmtY)))))
2199 return false;
2200
2201 // Canonicalize larger shift to the RHS.
2202 if (ShAmtX > ShAmtY) {
2203 std::swap(X, Y);
2204 std::swap(SrcX, SrcY);
2205 std::swap(ShAmtX, ShAmtY);
2206 }
2207
2208 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2209 // difference is the mask width so they can be easily concatenated together.
2210 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2211 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2212 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2213 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2214 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2215 !MaskTy->getElementType()->isIntegerTy(1) ||
2216 MaskTy->getNumElements() != ShAmtDiff ||
2217 MaskTy->getNumElements() > (BitWidth / 2))
2218 return false;
2219
2220 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2221 auto *ConcatIntTy =
2222 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2223 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2224
2225 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2226 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2227
2228 // TODO: Is it worth supporting multi-use cases?
2229 InstructionCost OldCost = 0;
2230 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2231 OldCost +=
2232 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2233 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2234 TTI::CastContextHint::None, CostKind);
2235 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2236 TTI::CastContextHint::None, CostKind);
2237
2238 InstructionCost NewCost = 0;
2239 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ConcatTy,
2240 MaskTy, ConcatMask, CostKind);
2241 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2242 TTI::CastContextHint::None, CostKind);
2243 if (Ty != ConcatIntTy)
2244 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2245 TTI::CastContextHint::None, CostKind);
2246 if (ShAmtX > 0)
2247 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2248
2249 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2250 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2251 << "\n");
2252
2253 if (NewCost > OldCost)
2254 return false;
2255
2256 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2257 // any residual zero-extension or shifting.
2258 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2259 Worklist.pushValue(Concat);
2260
2261 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2262
2263 if (Ty != ConcatIntTy) {
2264 Worklist.pushValue(Result);
2265 Result = Builder.CreateZExt(Result, Ty);
2266 }
2267
2268 if (ShAmtX > 0) {
2269 Worklist.pushValue(Result);
2270 Result = Builder.CreateShl(Result, ShAmtX);
2271 }
2272
2273 replaceValue(I, *Result);
2274 return true;
2275}
2276
2277/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2278/// --> "binop (shuffle), (shuffle)".
2279bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2280 BinaryOperator *BinOp;
2281 ArrayRef<int> OuterMask;
2282 if (!match(&I,
2283 m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask))))
2284 return false;
2285
2286 // Don't introduce poison into div/rem.
2287 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2288 return false;
2289
2290 Value *Op00, *Op01, *Op10, *Op11;
2291 ArrayRef<int> Mask0, Mask1;
2292 bool Match0 =
2293 match(BinOp->getOperand(0),
2294 m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
2295 bool Match1 =
2296 match(BinOp->getOperand(1),
2297 m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
2298 if (!Match0 && !Match1)
2299 return false;
2300
2301 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2302 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2303 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2304 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2305
2306 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2307 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2308 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2309 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2310 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2311 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2312 return false;
2313
2314 unsigned NumSrcElts = BinOpTy->getNumElements();
2315
2316 // Don't accept shuffles that reference the second operand in
2317 // div/rem or if it's an undef arg.
2318 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2319 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2320 return false;
2321
2322 // Merge outer / inner (or identity if no match) shuffles.
2323 SmallVector<int> NewMask0, NewMask1;
2324 for (int M : OuterMask) {
2325 if (M < 0 || M >= (int)NumSrcElts) {
2326 NewMask0.push_back(PoisonMaskElem);
2327 NewMask1.push_back(PoisonMaskElem);
2328 } else {
2329 NewMask0.push_back(Match0 ? Mask0[M] : M);
2330 NewMask1.push_back(Match1 ? Mask1[M] : M);
2331 }
2332 }
2333
2334 unsigned NumOpElts = Op0Ty->getNumElements();
2335 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2336 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2337 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2338 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2339 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2340 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2341
2342 // Try to merge shuffles across the binop if the new shuffles are not costly.
2343 InstructionCost OldCost =
2344 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
2345 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
2346 BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
2347 if (Match0)
2348 OldCost += TTI.getShuffleCost(
2349 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2350 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2351 if (Match1)
2352 OldCost += TTI.getShuffleCost(
2353 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2354 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2355
2356 InstructionCost NewCost =
2357 TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2358
2359 if (!IsIdentity0)
2360 NewCost +=
2361 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2362 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2363 if (!IsIdentity1)
2364 NewCost +=
2365 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2366 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2367
2368 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2369 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2370 << "\n");
2371
2372 // If costs are equal, still fold as we reduce instruction count.
2373 if (NewCost > OldCost)
2374 return false;
2375
2376 Value *LHS =
2377 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2378 Value *RHS =
2379 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2380 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2381
2382 // Intersect flags from the old binops.
2383 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2384 NewInst->copyIRFlags(BinOp);
2385
2386 Worklist.pushValue(LHS);
2387 Worklist.pushValue(RHS);
2388 replaceValue(I, *NewBO);
2389 return true;
2390}
2391
2392/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2393/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
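/// Illustrative sketch (editorial note, not upstream code) for the binop case:
///   %b0 = add <4 x i32> %x, %y
///   %b1 = add <4 x i32> %z, %w
///   %r  = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
/// -->
///   %s0 = shufflevector <4 x i32> %x, <4 x i32> %z, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
///   %s1 = shufflevector <4 x i32> %y, <4 x i32> %w, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
///   %r  = add <4 x i32> %s0, %s1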
2394bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2395 ArrayRef<int> OldMask;
2396 Instruction *LHS, *RHS;
2397 if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)),
2398 m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
2399 return false;
2400
2401 // TODO: Add support for addlike etc.
2402 if (LHS->getOpcode() != RHS->getOpcode())
2403 return false;
2404
2405 Value *X, *Y, *Z, *W;
2406 bool IsCommutative = false;
2407 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2408 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2409 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2410 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2411 auto *BO = cast<BinaryOperator>(LHS);
2412 // Don't introduce poison into div/rem.
2413 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2414 return false;
2415 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2416 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2417 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2418 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2419 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2420 } else
2421 return false;
2422
2423 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2424 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2425 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2426 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2427 return false;
2428
2429 unsigned NumSrcElts = BinOpTy->getNumElements();
2430
2431 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2432 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2433 std::swap(X, Y);
2434
2435 auto ConvertToUnary = [NumSrcElts](int &M) {
2436 if (M >= (int)NumSrcElts)
2437 M -= NumSrcElts;
2438 };
2439
2440 SmallVector<int> NewMask0(OldMask);
2441 TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
2442 if (X == Z) {
2443 llvm::for_each(NewMask0, ConvertToUnary);
2444 SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
2445 Z = PoisonValue::get(BinOpTy);
2446 }
2447
2448 SmallVector<int> NewMask1(OldMask);
2449 TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
2450 if (Y == W) {
2451 llvm::for_each(NewMask1, ConvertToUnary);
2452 SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
2453 W = PoisonValue::get(BinOpTy);
2454 }
2455
2456 // Try to replace a binop with a shuffle if the shuffle is not costly.
2457 InstructionCost OldCost =
2458 TTI.getInstructionCost(LHS, CostKind) +
2459 TTI.getInstructionCost(RHS, CostKind) +
2460 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2461 BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
2462 &I);
2463
2464 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2465 // where one use shuffles have gotten split across the binop/cmp. These
2466 // often allow a major reduction in total cost that wouldn't happen as
2467 // individual folds.
2468 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2469 TTI::TargetCostKind CostKind) -> bool {
2470 Value *InnerOp;
2471 ArrayRef<int> InnerMask;
2472 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2473 m_Mask(InnerMask)))) &&
2474 InnerOp->getType() == Op->getType() &&
2475 all_of(InnerMask,
2476 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2477 for (int &M : Mask)
2478 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2479 M = InnerMask[M - Offset];
2480 M = 0 <= M ? M + Offset : M;
2481 }
2482 OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind);
2483 Op = InnerOp;
2484 return true;
2485 }
2486 return false;
2487 };
2488 bool ReducedInstCount = false;
2489 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2490 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2491 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2492 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2493
2494 auto *ShuffleCmpTy =
2495 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2496 InstructionCost NewCost =
2497 TTI.getShuffleCost(SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0,
2498 nullptr, {X, Z}) +
2499 TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1, CostKind, 0,
2500 nullptr, {Y, W});
2501
2502 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2503 NewCost +=
2504 TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
2505 } else {
2506 NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
2507 ShuffleDstTy, PredLHS, CostKind);
2508 }
2509
2510 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2511 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2512 << "\n");
2513
2514 // If either shuffle will constant fold away, then fold for the same cost as
2515 // we will reduce the instruction count.
2516 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2517 (isa<Constant>(Y) && isa<Constant>(W));
2518 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2519 return false;
2520
2521 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2522 Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
2523 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2524 ? Builder.CreateBinOp(
2525 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2526 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2527
2528 // Intersect flags from the old binops.
2529 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2530 NewInst->copyIRFlags(LHS);
2531 NewInst->andIRFlags(RHS);
2532 }
2533
2534 Worklist.pushValue(Shuf0);
2535 Worklist.pushValue(Shuf1);
2536 replaceValue(I, *NewBO);
2537 return true;
2538}
2539
2540/// Try to convert,
2541/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2542/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
2543bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2544 ArrayRef<int> Mask;
2545 Value *C1, *T1, *F1, *C2, *T2, *F2;
2546 if (!match(&I, m_Shuffle(
2547 m_OneUse(m_Select(m_Value(C1), m_Value(T1), m_Value(F1))),
2548 m_OneUse(m_Select(m_Value(C2), m_Value(T2), m_Value(F2))),
2549 m_Mask(Mask))))
2550 return false;
2551
2552 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2553 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2554 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2555 return false;
2556
2557 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2558 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2559 // SelectInsts must have the same FMF.
2560 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2561 ((SI0FOp != nullptr) &&
2562 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2563 return false;
2564
2565 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2566 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2567 auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
2568 auto SelOp = Instruction::Select;
2569 InstructionCost OldCost = TTI.getCmpSelInstrCost(
2570 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2571 OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
2572 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2573 OldCost +=
2574 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2575 {I.getOperand(0), I.getOperand(1)}, &I);
2576
2577 InstructionCost NewCost = TTI.getShuffleCost(
2578 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2579 Mask, CostKind, 0, nullptr, {C1, C2});
2580 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2581 nullptr, {T1, T2});
2582 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2583 nullptr, {F1, F2});
2584 auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2585 toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2586 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2587 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2588
2589 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2590 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2591 << "\n");
2592 if (NewCost > OldCost)
2593 return false;
2594
2595 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2596 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2597 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2598 Value *NewSel;
2599 // We presuppose that the SelectInsts have the same FMF.
2600 if (SI0FOp)
2601 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2602 SI0FOp->getFastMathFlags());
2603 else
2604 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2605
2606 Worklist.pushValue(ShuffleCmp);
2607 Worklist.pushValue(ShuffleTrue);
2608 Worklist.pushValue(ShuffleFalse);
2609 replaceValue(I, *NewSel);
2610 return true;
2611}
2612
2613/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2614/// into "castop (shuffle)".
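/// Illustrative sketch (editorial note, not upstream code):
///   %c0 = zext <4 x i16> %x to <4 x i32>
///   %c1 = zext <4 x i16> %y to <4 x i32>
///   %r  = shufflevector <4 x i32> %c0, <4 x i32> %c1, <8 x i32> <i32 0, i32 1, ..., i32 7>
/// -->
///   %s  = shufflevector <4 x i16> %x, <4 x i16> %y, <8 x i32> <i32 0, i32 1, ..., i32 7>
///   %r  = zext <8 x i16> %s to <8 x i32>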
2615bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2616 Value *V0, *V1;
2617 ArrayRef<int> OldMask;
2618 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2619 return false;
2620
2621 // Check whether this is a binary shuffle.
2622 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2623
2624 auto *C0 = dyn_cast<CastInst>(V0);
2625 auto *C1 = dyn_cast<CastInst>(V1);
2626 if (!C0 || (IsBinaryShuffle && !C1))
2627 return false;
2628
2629 Instruction::CastOps Opcode = C0->getOpcode();
2630
2631 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2632 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2633 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2634 return false;
2635
2636 if (IsBinaryShuffle) {
2637 if (C0->getSrcTy() != C1->getSrcTy())
2638 return false;
2639 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2640 if (Opcode != C1->getOpcode()) {
2641 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2642 Opcode = Instruction::SExt;
2643 else
2644 return false;
2645 }
2646 }
2647
2648 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2649 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2650 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2651 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2652 return false;
2653
2654 unsigned NumSrcElts = CastSrcTy->getNumElements();
2655 unsigned NumDstElts = CastDstTy->getNumElements();
2656 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2657 "Only bitcasts expected to alter src/dst element counts");
2658
2659 // Check for bitcasting of unscalable vector types.
2660 // e.g. <32 x i40> -> <40 x i32>
2661 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2662 (NumDstElts % NumSrcElts) != 0)
2663 return false;
2664
2665 SmallVector<int, 16> NewMask;
2666 if (NumSrcElts >= NumDstElts) {
2667 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2668 // always be expanded to the equivalent form choosing narrower elements.
2669 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2670 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2671 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2672 } else {
2673 // The bitcast is from narrow elements to wide elements. The shuffle mask
2674 // must choose consecutive elements to allow casting first.
2675 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2676 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2677 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2678 return false;
2679 }
2680
2681 auto *NewShuffleDstTy =
2682 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2683
2684 // Try to replace a castop with a shuffle if the shuffle is not costly.
2685 InstructionCost CostC0 =
2686 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2687 TTI::CastContextHint::None, CostKind);
2688
2689 TargetTransformInfo::ShuffleKind ShuffleKind;
2690 if (IsBinaryShuffle)
2691 ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc;
2692 else
2693 ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc;
2694
2695 InstructionCost OldCost = CostC0;
2696 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2697 CostKind, 0, nullptr, {}, &I);
2698
2699 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2700 CastSrcTy, NewMask, CostKind);
2701 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
2702 TTI::CastContextHint::None, CostKind);
2703 if (!C0->hasOneUse())
2704 NewCost += CostC0;
2705 if (IsBinaryShuffle) {
2706 InstructionCost CostC1 =
2707 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2708 TTI::CastContextHint::None, CostKind);
2709 OldCost += CostC1;
2710 if (!C1->hasOneUse())
2711 NewCost += CostC1;
2712 }
2713
2714 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2715 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2716 << "\n");
2717 if (NewCost > OldCost)
2718 return false;
2719
2720 Value *Shuf;
2721 if (IsBinaryShuffle)
2722 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2723 NewMask);
2724 else
2725 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2726
2727 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2728
2729 // Intersect flags from the old casts.
2730 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2731 NewInst->copyIRFlags(C0);
2732 if (IsBinaryShuffle)
2733 NewInst->andIRFlags(C1);
2734 }
2735
2736 Worklist.pushValue(Shuf);
2737 replaceValue(I, *Cast);
2738 return true;
2739}
2740
2741/// Try to convert any of:
2742/// "shuffle (shuffle x, y), (shuffle y, x)"
2743/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2744/// "shuffle (shuffle x, undef), y"
2745/// "shuffle x, (shuffle y, undef)"
2746/// into "shuffle x, y".
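/// Illustrative sketch (editorial note, not upstream code) for the first form:
///   %s0 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
///   %s1 = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
///   %r  = shufflevector <4 x i32> %s0, <4 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
/// -->
///   %r  = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 4, i32 6, i32 2>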
2747bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2748 ArrayRef<int> OuterMask;
2749 Value *OuterV0, *OuterV1;
2750 if (!match(&I,
2751 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2752 return false;
2753
2754 ArrayRef<int> InnerMask0, InnerMask1;
2755 Value *X0, *X1, *Y0, *Y1;
2756 bool Match0 =
2757 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2758 bool Match1 =
2759 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2760 if (!Match0 && !Match1)
2761 return false;
2762
2763 // If the outer shuffle is a permute, then create a fake inner all-poison
2764 // shuffle. This is easier than accounting for length-changing shuffles below.
2765 SmallVector<int, 16> PoisonMask1;
2766 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2767 X1 = X0;
2768 Y1 = Y0;
2769 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2770 InnerMask1 = PoisonMask1;
2771 Match1 = true; // fake match
2772 }
2773
2774 X0 = Match0 ? X0 : OuterV0;
2775 Y0 = Match0 ? Y0 : OuterV0;
2776 X1 = Match1 ? X1 : OuterV1;
2777 Y1 = Match1 ? Y1 : OuterV1;
2778 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2779 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2780 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2781 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2782 X0->getType() != X1->getType())
2783 return false;
2784
2785 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2786 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2787
2788 // Attempt to merge shuffles, matching up to 2 source operands.
2789 // Replace index to a poison arg with PoisonMaskElem.
2790 // Bail if either inner masks reference an undef arg.
2791 SmallVector<int, 16> NewMask(OuterMask);
2792 Value *NewX = nullptr, *NewY = nullptr;
2793 for (int &M : NewMask) {
2794 Value *Src = nullptr;
2795 if (0 <= M && M < (int)NumImmElts) {
2796 Src = OuterV0;
2797 if (Match0) {
2798 M = InnerMask0[M];
2799 Src = M >= (int)NumSrcElts ? Y0 : X0;
2800 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2801 }
2802 } else if (M >= (int)NumImmElts) {
2803 Src = OuterV1;
2804 M -= NumImmElts;
2805 if (Match1) {
2806 M = InnerMask1[M];
2807 Src = M >= (int)NumSrcElts ? Y1 : X1;
2808 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2809 }
2810 }
2811 if (Src && M != PoisonMaskElem) {
2812 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2813 if (isa<UndefValue>(Src)) {
2814 // We've referenced an undef element - if it's poison, update the shuffle
2815 // mask, else bail.
2816 if (!isa<PoisonValue>(Src))
2817 return false;
2818 M = PoisonMaskElem;
2819 continue;
2820 }
2821 if (!NewX || NewX == Src) {
2822 NewX = Src;
2823 continue;
2824 }
2825 if (!NewY || NewY == Src) {
2826 M += NumSrcElts;
2827 NewY = Src;
2828 continue;
2829 }
2830 return false;
2831 }
2832 }
2833
2834 if (!NewX)
2835 return PoisonValue::get(ShuffleDstTy);
2836 if (!NewY)
2837 NewY = PoisonValue::get(ShuffleSrcTy);
2838
2839 // Have we folded to an Identity shuffle?
2840 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
2841 replaceValue(I, *NewX);
2842 return true;
2843 }
2844
2845 // Try to merge the shuffles if the new shuffle is not costly.
2846 InstructionCost InnerCost0 = 0;
2847 if (Match0)
2848 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
2849
2850 InstructionCost InnerCost1 = 0;
2851 if (Match1)
2852 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
2853
2854 InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind);
2855
2856 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
2857
2858 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
2859 TargetTransformInfo::ShuffleKind SK =
2860 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
2861 : TargetTransformInfo::SK_PermuteTwoSrc;
2862 InstructionCost NewCost =
2863 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
2864 nullptr, {NewX, NewY});
2865 if (!OuterV0->hasOneUse())
2866 NewCost += InnerCost0;
2867 if (!OuterV1->hasOneUse())
2868 NewCost += InnerCost1;
2869
2870 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
2871 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2872 << "\n");
2873 if (NewCost > OldCost)
2874 return false;
2875
2876 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
2877 replaceValue(I, *Shuf);
2878 return true;
2879}
2880
2881/// Try to convert
2882/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
2883bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
2884 Value *V0, *V1;
2885 ArrayRef<int> OldMask;
2886 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
2887 m_Mask(OldMask))))
2888 return false;
2889
2890 auto *II0 = dyn_cast<IntrinsicInst>(V0);
2891 auto *II1 = dyn_cast<IntrinsicInst>(V1);
2892 if (!II0 || !II1)
2893 return false;
2894
2895 Intrinsic::ID IID = II0->getIntrinsicID();
2896 if (IID != II1->getIntrinsicID())
2897 return false;
2898
2899 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2900 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
2901 if (!ShuffleDstTy || !II0Ty)
2902 return false;
2903
2904 if (!isTriviallyVectorizable(IID))
2905 return false;
2906
2907 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2908 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) &&
2909 II0->getArgOperand(I) != II1->getArgOperand(I))
2910 return false;
2911
2912 InstructionCost OldCost =
2913 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
2914 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
2915 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2916 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
2917
2918 SmallVector<Type *> NewArgsTy;
2919 InstructionCost NewCost = 0;
2920 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
2921 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
2922 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
2923 } else {
2924 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
2925 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
2926 ShuffleDstTy->getNumElements());
2927 NewArgsTy.push_back(ArgTy);
2928 NewCost += TTI.getShuffleCost(
2929 TargetTransformInfo::SK_PermuteTwoSrc, ArgTy, VecTy, OldMask,
2930 CostKind, 0, nullptr, {II0->getArgOperand(I), II1->getArgOperand(I)});
2931 }
2932 }
2933 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
2934 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
2935
2936 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
2937 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2938 << "\n");
2939
2940 if (NewCost > OldCost)
2941 return false;
2942
2943 SmallVector<Value *> NewArgs;
2944 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2945 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
2946 NewArgs.push_back(II0->getArgOperand(I));
2947 } else {
2948 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
2949 II1->getArgOperand(I), OldMask);
2950 NewArgs.push_back(Shuf);
2951 Worklist.pushValue(Shuf);
2952 }
2953 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
2954
2955 // Intersect flags from the old intrinsics.
2956 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
2957 NewInst->copyIRFlags(II0);
2958 NewInst->andIRFlags(II1);
2959 }
2960
2961 replaceValue(I, *NewIntrinsic);
2962 return true;
2963}
2964
2965/// Try to convert
2966/// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
2967bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
2968 Value *V0;
2969 ArrayRef<int> Mask;
2970 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_Undef(), m_Mask(Mask))))
2971 return false;
2972
2973 auto *II0 = dyn_cast<IntrinsicInst>(V0);
2974 if (!II0)
2975 return false;
2976
2977 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2978 auto *IntrinsicSrcTy = dyn_cast<FixedVectorType>(II0->getType());
2979 if (!ShuffleDstTy || !IntrinsicSrcTy)
2980 return false;
2981
2982 // Validate it's a pure permute, mask should only reference the first vector
2983 unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
2984 if (any_of(Mask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2985 return false;
2986
2987 Intrinsic::ID IID = II0->getIntrinsicID();
2988 if (!isTriviallyVectorizable(IID))
2989 return false;
2990
2991 // Cost analysis
2992 InstructionCost OldCost =
2993 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
2994 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
2995 IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
2996
2997 SmallVector<Type *> NewArgsTy;
2998 InstructionCost NewCost = 0;
2999 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3000 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3001 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3002 } else {
3003 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3004 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3005 ShuffleDstTy->getNumElements());
3006 NewArgsTy.push_back(ArgTy);
3007 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3008 ArgTy, VecTy, Mask, CostKind, 0, nullptr,
3009 {II0->getArgOperand(I)});
3010 }
3011 }
3012 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3013 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3014
3015 LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n OldCost: "
3016 << OldCost << " vs NewCost: " << NewCost << "\n");
3017
3018 if (NewCost > OldCost)
3019 return false;
3020
3021 // Transform
3022 SmallVector<Value *> NewArgs;
3023 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3024 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3025 NewArgs.push_back(II0->getArgOperand(I));
3026 } else {
3027 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), Mask);
3028 NewArgs.push_back(Shuf);
3029 Worklist.pushValue(Shuf);
3030 }
3031 }
3032
3033 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3034
3035 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic))
3036 NewInst->copyIRFlags(II0);
3037
3038 replaceValue(I, *NewIntrinsic);
3039 return true;
3040}
3041
3042using InstLane = std::pair<Use *, int>;
3043
3044static InstLane lookThroughShuffles(Use *U, int Lane) {
3045 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
3046 unsigned NumElts =
3047 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
3048 int M = SV->getMaskValue(Lane);
3049 if (M < 0)
3050 return {nullptr, PoisonMaskElem};
3051 if (static_cast<unsigned>(M) < NumElts) {
3052 U = &SV->getOperandUse(0);
3053 Lane = M;
3054 } else {
3055 U = &SV->getOperandUse(1);
3056 Lane = M - NumElts;
3057 }
3058 }
3059 return InstLane{U, Lane};
3060}
3061
3062 static SmallVector<InstLane>
3063 generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
3064 SmallVector<InstLane> NItem;
3065 for (InstLane IL : Item) {
3066 auto [U, Lane] = IL;
3067 InstLane OpLane =
3068 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
3069 Lane)
3070 : InstLane{nullptr, PoisonMaskElem};
3071 NItem.emplace_back(OpLane);
3072 }
3073 return NItem;
3074}
3075
3076/// Detect concat of multiple values into a vector
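/// (illustrative note: an Item whose lanes walk A[0..3] and then B[0..3] for
/// two <4 x i32> sources A and B is such a concat, provided the target treats
/// the wider 8-element shuffle as free).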
3077static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
3078 const TargetTransformInfo &TTI) {
3079 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
3080 unsigned NumElts = Ty->getNumElements();
3081 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
3082 return false;
3083
3084 // Check that the concat is free, usually meaning that the type will be split
3085 // during legalization.
3086 SmallVector<int, 16> ConcatMask(NumElts * 2);
3087 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
3088 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
3089 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
3090 Ty, ConcatMask, CostKind) != 0)
3091 return false;
3092
3093 unsigned NumSlices = Item.size() / NumElts;
3094 // Currently we generate a tree of shuffles for the concats, which limits us
3095 // to a power2.
3096 if (!isPowerOf2_32(NumSlices))
3097 return false;
3098 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
3099 Use *SliceV = Item[Slice * NumElts].first;
3100 if (!SliceV || SliceV->get()->getType() != Ty)
3101 return false;
3102 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
3103 auto [V, Lane] = Item[Slice * NumElts + Elt];
3104 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
3105 return false;
3106 }
3107 }
3108 return true;
3109}
3110
3111static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
3112 const SmallPtrSet<Use *, 4> &IdentityLeafs,
3113 const SmallPtrSet<Use *, 4> &SplatLeafs,
3114 const SmallPtrSet<Use *, 4> &ConcatLeafs,
3115 IRBuilderBase &Builder,
3116 const TargetTransformInfo *TTI) {
3117 auto [FrontU, FrontLane] = Item.front();
3118
3119 if (IdentityLeafs.contains(FrontU)) {
3120 return FrontU->get();
3121 }
3122 if (SplatLeafs.contains(FrontU)) {
3123 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
3124 return Builder.CreateShuffleVector(FrontU->get(), Mask);
3125 }
3126 if (ConcatLeafs.contains(FrontU)) {
3127 unsigned NumElts =
3128 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
3129 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
3130 for (unsigned S = 0; S < Values.size(); ++S)
3131 Values[S] = Item[S * NumElts].first->get();
3132
3133 while (Values.size() > 1) {
3134 NumElts *= 2;
3135 SmallVector<int, 16> Mask(NumElts, 0);
3136 std::iota(Mask.begin(), Mask.end(), 0);
3137 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
3138 for (unsigned S = 0; S < NewValues.size(); ++S)
3139 NewValues[S] =
3140 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
3141 Values = NewValues;
3142 }
3143 return Values[0];
3144 }
3145
3146 auto *I = cast<Instruction>(FrontU->get());
3147 auto *II = dyn_cast<IntrinsicInst>(I);
3148 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
3149 SmallVector<Value *> Ops(NumOps);
3150 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
3151 if (II &&
3152 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
3153 Ops[Idx] = II->getOperand(Idx);
3154 continue;
3155 }
3156 Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
3157 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
3158 Builder, TTI);
3159 }
3160
3161 SmallVector<Value *, 8> ValueList;
3162 for (const auto &Lane : Item)
3163 if (Lane.first)
3164 ValueList.push_back(Lane.first->get());
3165
3166 Type *DstTy =
3167 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
3168 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
3169 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
3170 Ops[0], Ops[1]);
3171 propagateIRFlags(Value, ValueList);
3172 return Value;
3173 }
3174 if (auto *CI = dyn_cast<CmpInst>(I)) {
3175 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3176 propagateIRFlags(Value, ValueList);
3177 return Value;
3178 }
3179 if (auto *SI = dyn_cast<SelectInst>(I)) {
3180 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3181 propagateIRFlags(Value, ValueList);
3182 return Value;
3183 }
3184 if (auto *CI = dyn_cast<CastInst>(I)) {
3185 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3186 propagateIRFlags(Value, ValueList);
3187 return Value;
3188 }
3189 if (II) {
3190 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3191 propagateIRFlags(Value, ValueList);
3192 return Value;
3193 }
3194 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3195 auto *Value =
3196 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3197 propagateIRFlags(Value, ValueList);
3198 return Value;
3199}
3200
3201// Starting from a shuffle, look up through operands tracking the shuffled index
3202// of each lane. If we can simplify away the shuffles to identities then
3203// do so.
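// Illustrative example (not from the source): with M = <1, 0, 3, 2>,
//   %x = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> M
//   %y = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> M
//   %s = add <4 x i32> %x, %y
//   %r = shufflevector <4 x i32> %s, <4 x i32> poison, <4 x i32> M
// each lane of %r traces back to the same lane of %a and %b, so this can be
// rewritten as %r = add <4 x i32> %a, %b.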
3204bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3205 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3206 if (!Ty || I.use_empty())
3207 return false;
3208
3209 SmallVector<InstLane> Start(Ty->getNumElements());
3210 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3211 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
3212
3213 SmallVector<SmallVector<InstLane>> Worklist;
3214 Worklist.push_back(Start);
3215 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
3216 unsigned NumVisited = 0;
3217
3218 while (!Worklist.empty()) {
3219 if (++NumVisited > MaxInstrsToScan)
3220 return false;
3221
3222 SmallVector<InstLane> Item = Worklist.pop_back_val();
3223 auto [FrontU, FrontLane] = Item.front();
3224
3225 // If we found an undef first lane then bail out to keep things simple.
3226 if (!FrontU)
3227 return false;
3228
3229 // Helper to peek through bitcasts to the same value.
3230 auto IsEquiv = [&](Value *X, Value *Y) {
3231 return X->getType() == Y->getType() &&
3233 };
3234
3235 // Look for an identity value.
3236 if (FrontLane == 0 &&
3237 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
3238 Ty->getNumElements() &&
3239 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3240 Value *FrontV = Item.front().first->get();
3241 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
3242 E.value().second == (int)E.index());
3243 })) {
3244 IdentityLeafs.insert(FrontU);
3245 continue;
3246 }
3247 // Look for constants, for the moment only supporting constant splats.
3248 if (auto *C = dyn_cast<Constant>(FrontU);
3249 C && C->getSplatValue() &&
3250 all_of(drop_begin(Item), [Item](InstLane &IL) {
3251 Value *FrontV = Item.front().first->get();
3252 Use *U = IL.first;
3253 return !U || (isa<Constant>(U->get()) &&
3254 cast<Constant>(U->get())->getSplatValue() ==
3255 cast<Constant>(FrontV)->getSplatValue());
3256 })) {
3257 SplatLeafs.insert(FrontU);
3258 continue;
3259 }
3260 // Look for a splat value.
3261 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3262 auto [FrontU, FrontLane] = Item.front();
3263 auto [U, Lane] = IL;
3264 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3265 })) {
3266 SplatLeafs.insert(FrontU);
3267 continue;
3268 }
3269
3270 // We need each element to be the same type of value, and check that each
3271 // element has a single use.
3272 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3273 Value *FrontV = Item.front().first->get();
3274 if (!IL.first)
3275 return true;
3276 Value *V = IL.first->get();
3277 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3278 return false;
3279 if (V->getValueID() != FrontV->getValueID())
3280 return false;
3281 if (auto *CI = dyn_cast<CmpInst>(V))
3282 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3283 return false;
3284 if (auto *CI = dyn_cast<CastInst>(V))
3285 if (CI->getSrcTy()->getScalarType() !=
3286 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3287 return false;
3288 if (auto *SI = dyn_cast<SelectInst>(V))
3289 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3290 SI->getOperand(0)->getType() !=
3291 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3292 return false;
3293 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3294 return false;
3295 auto *II = dyn_cast<IntrinsicInst>(V);
3296 return !II || (isa<IntrinsicInst>(FrontV) &&
3297 II->getIntrinsicID() ==
3298 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3299 !II->hasOperandBundles());
3300 };
3301 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3302 // Check the operator is one that we support.
3303 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3304 // We exclude div/rem in case they hit UB from poison lanes.
3305 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3306 BO && BO->isIntDivRem())
3307 return false;
3308 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3309 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3310 continue;
3311 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3312 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3313 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3314 continue;
3315 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3316 // TODO: Handle vector widening/narrowing bitcasts.
3317 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3318 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3319 if (DstTy && SrcTy &&
3320 SrcTy->getNumElements() == DstTy->getNumElements()) {
3321 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3322 continue;
3323 }
3324 } else if (isa<SelectInst>(FrontU)) {
3325 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3326 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3327 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
3328 continue;
3329 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3330 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3331 !II->hasOperandBundles()) {
3332 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3333 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3334 &TTI)) {
3335 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3336 Value *FrontV = Item.front().first->get();
3337 Use *U = IL.first;
3338 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3339 cast<Instruction>(FrontV)->getOperand(Op));
3340 }))
3341 return false;
3342 continue;
3343 }
3344 Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
3345 }
3346 continue;
3347 }
3348 }
3349
3350 if (isFreeConcat(Item, CostKind, TTI)) {
3351 ConcatLeafs.insert(FrontU);
3352 continue;
3353 }
3354
3355 return false;
3356 }
3357
3358 if (NumVisited <= 1)
3359 return false;
3360
3361 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3362
3363 // If we got this far, we know the shuffles are superfluous and can be
3364 // removed. Scan through again and generate the new tree of instructions.
3365 Builder.SetInsertPoint(&I);
3366 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3367 ConcatLeafs, Builder, &TTI);
3368 replaceValue(I, *V);
3369 return true;
3370}
3371
3372/// Given a commutative reduction, the order of the input lanes does not alter
3373/// the results. We can use this to remove certain shuffles feeding the
3374/// reduction, removing the need to shuffle at all.
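/// For instance (illustrative):
///   %s = shufflevector <4 x i32> %v, <4 x i32> poison,
///        <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
/// The reversal cannot change the sum, so the shuffle mask may be re-sorted
/// (here into an identity) when the cost model says that is cheaper.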
3375bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3376 auto *II = dyn_cast<IntrinsicInst>(&I);
3377 if (!II)
3378 return false;
3379 switch (II->getIntrinsicID()) {
3380 case Intrinsic::vector_reduce_add:
3381 case Intrinsic::vector_reduce_mul:
3382 case Intrinsic::vector_reduce_and:
3383 case Intrinsic::vector_reduce_or:
3384 case Intrinsic::vector_reduce_xor:
3385 case Intrinsic::vector_reduce_smin:
3386 case Intrinsic::vector_reduce_smax:
3387 case Intrinsic::vector_reduce_umin:
3388 case Intrinsic::vector_reduce_umax:
3389 break;
3390 default:
3391 return false;
3392 }
3393
3394 // Find all the inputs when looking through operations that do not alter the
3395 // lane order (binops, for example). Currently we look for a single shuffle,
3396 // and can ignore splat values.
3397 std::queue<Value *> Worklist;
3398 SmallPtrSet<Value *, 4> Visited;
3399 ShuffleVectorInst *Shuffle = nullptr;
3400 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3401 Worklist.push(Op);
3402
3403 while (!Worklist.empty()) {
3404 Value *CV = Worklist.front();
3405 Worklist.pop();
3406 if (Visited.contains(CV))
3407 continue;
3408
3409 // Splats don't change the order, so can be safely ignored.
3410 if (isSplatValue(CV))
3411 continue;
3412
3413 Visited.insert(CV);
3414
3415 if (auto *CI = dyn_cast<Instruction>(CV)) {
3416 if (CI->isBinaryOp()) {
3417 for (auto *Op : CI->operand_values())
3418 Worklist.push(Op);
3419 continue;
3420 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3421 if (Shuffle && Shuffle != SV)
3422 return false;
3423 Shuffle = SV;
3424 continue;
3425 }
3426 }
3427
3428 // Anything else is currently an unknown node.
3429 return false;
3430 }
3431
3432 if (!Shuffle)
3433 return false;
3434
3435 // Check all uses of the binary ops and shuffles are also included in the
3436 // lane-invariant operations (Visited should be the list of lanewise
3437 // instructions, including the shuffle that we found).
3438 for (auto *V : Visited)
3439 for (auto *U : V->users())
3440 if (!Visited.contains(U) && U != &I)
3441 return false;
3442
3443 FixedVectorType *VecType =
3444 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3445 if (!VecType)
3446 return false;
3447 FixedVectorType *ShuffleInputType =
3448 dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
3449 if (!ShuffleInputType)
3450 return false;
3451 unsigned NumInputElts = ShuffleInputType->getNumElements();
3452
3453 // Find the mask from sorting the lanes into order. This is most likely to
3454 // become an identity or concat mask. Undef elements are pushed to the end.
3455 SmallVector<int> ConcatMask;
3456 Shuffle->getShuffleMask(ConcatMask);
3457 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3458 bool UsesSecondVec =
3459 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3460
3461 InstructionCost OldCost = TTI.getShuffleCost(
3462 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3463 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3464 InstructionCost NewCost = TTI.getShuffleCost(
3465 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3466 ShuffleInputType, ConcatMask, CostKind);
3467
3468 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3469 << "\n");
3470 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3471 << "\n");
3472 bool MadeChanges = false;
3473 if (NewCost < OldCost) {
3474 Builder.SetInsertPoint(Shuffle);
3475 Value *NewShuffle = Builder.CreateShuffleVector(
3476 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3477 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3478 replaceValue(*Shuffle, *NewShuffle);
3479 return true;
3480 }
3481
3482 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3483 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3484 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3485 return MadeChanges;
3486}
3487
3488/// For a given chain of patterns of the following form:
3489///
3490/// ```
3491/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3492///
3493/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3494/// ty1> %1)
3495/// OR
3496/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3497///
3498/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3499/// ...
3500/// ...
3501/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3502/// 3), <n x ty1> %(i - 2)
3503/// OR
3504/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3505///
3506/// %(i) = extractelement <n x ty1> %(i - 1), 0
3507/// ```
3508///
3509/// Where:
3510/// `mask` follows a partition pattern:
3511///
3512/// Ex:
3513/// [n = 8, p = poison]
3514///
3515/// 4 5 6 7 | p p p p
3516/// 2 3 | p p p p p p
3517/// 1 | p p p p p p p
3518///
3519/// For powers of 2, there's a consistent pattern, but for other cases
3520/// the parity of the current half value at each step decides the
3521/// next partition half (see `ExpectedParityMask` for more logical details
3522/// in generalising this).
3523///
3524/// Ex:
3525/// [n = 6]
3526///
3527/// 3 4 5 | p p p
3528/// 1 2 | p p p p
3529/// 1 | p p p p p
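/// As a concrete sketch for n = 4 (assuming umax):
///   %s1 = shufflevector <4 x i32> %v, <4 x i32> poison,
///         <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
///   %m1 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %v, <4 x i32> %s1)
///   %s2 = shufflevector <4 x i32> %m1, <4 x i32> poison,
///         <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
///   %m2 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %m1, <4 x i32> %s2)
///   %r  = extractelement <4 x i32> %m2, i64 0
/// can be replaced by
///   %r = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v)
/// when the reduction intrinsic is cheaper.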
3530bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3531 // Going bottom-up for the pattern.
3532 std::queue<Value *> InstWorklist;
3533 InstructionCost OrigCost = 0;
3534
3535 // Common instruction operation after each shuffle op.
3536 std::optional<unsigned int> CommonCallOp = std::nullopt;
3537 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3538
3539 bool IsFirstCallOrBinInst = true;
3540 bool ShouldBeCallOrBinInst = true;
3541
3542 // This stores the last used instructions for shuffle/common op.
3543 //
3544 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3545 // instructions from either shuffle/common op.
3546 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3547
3548 Value *VecOpEE;
3549 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3550 return false;
3551
3552 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3553 if (!FVT)
3554 return false;
3555
3556 int64_t VecSize = FVT->getNumElements();
3557 if (VecSize < 2)
3558 return false;
3559
3560 // Number of levels would be ~log2(n), considering we always partition
3561 // by half for this fold pattern.
3562 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3563 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3564
3565 // This is how we generalise for all element sizes.
3566 // At each step, if vector size is odd, we need non-poison
3567 // values to cover the dominant half so we don't miss out on any element.
3568 //
3569 // This mask will help us retrieve this as we go from bottom to top:
3570 //
3571 // Mask Set -> N = N * 2 - 1
3572 // Mask Unset -> N = N * 2
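// Worked example: for VecSize = 6, NumLevels = 3 and the halving sequence is
// 6 -> 3 -> 2; only the odd step (3) sets its bit, so
// ExpectedParityMask = 0b010.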
3573 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3574 Cur = (Cur + 1) / 2, --Mask) {
3575 if (Cur & 1)
3576 ExpectedParityMask |= (1ll << Mask);
3577 }
3578
3579 InstWorklist.push(VecOpEE);
3580
3581 while (!InstWorklist.empty()) {
3582 Value *CI = InstWorklist.front();
3583 InstWorklist.pop();
3584
3585 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3586 if (!ShouldBeCallOrBinInst)
3587 return false;
3588
3589 if (!IsFirstCallOrBinInst &&
3590 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3591 return false;
3592
3593 // For the first found call/bin op, the vector has to come from the
3594 // extract element op.
3595 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3596 return false;
3597 IsFirstCallOrBinInst = false;
3598
3599 if (!CommonCallOp)
3600 CommonCallOp = II->getIntrinsicID();
3601 if (II->getIntrinsicID() != *CommonCallOp)
3602 return false;
3603
3604 switch (II->getIntrinsicID()) {
3605 case Intrinsic::umin:
3606 case Intrinsic::umax:
3607 case Intrinsic::smin:
3608 case Intrinsic::smax: {
3609 auto *Op0 = II->getOperand(0);
3610 auto *Op1 = II->getOperand(1);
3611 PrevVecV[0] = Op0;
3612 PrevVecV[1] = Op1;
3613 break;
3614 }
3615 default:
3616 return false;
3617 }
3618 ShouldBeCallOrBinInst ^= 1;
3619
3620 IntrinsicCostAttributes ICA(
3621 *CommonCallOp, II->getType(),
3622 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
3623 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
3624
3625 // We may need a swap here since it can be (a, b) or (b, a)
3626 // and accordingly change as we go up.
3627 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3628 std::swap(PrevVecV[0], PrevVecV[1]);
3629 InstWorklist.push(PrevVecV[1]);
3630 InstWorklist.push(PrevVecV[0]);
3631 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
3632 // Similar logic for bin ops.
3633
3634 if (!ShouldBeCallOrBinInst)
3635 return false;
3636
3637 if (!IsFirstCallOrBinInst &&
3638 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3639 return false;
3640
3641 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3642 return false;
3643 IsFirstCallOrBinInst = false;
3644
3645 if (!CommonBinOp)
3646 CommonBinOp = BinOp->getOpcode();
3647
3648 if (BinOp->getOpcode() != *CommonBinOp)
3649 return false;
3650
3651 switch (*CommonBinOp) {
3652 case BinaryOperator::Add:
3653 case BinaryOperator::Mul:
3654 case BinaryOperator::Or:
3655 case BinaryOperator::And:
3656 case BinaryOperator::Xor: {
3657 auto *Op0 = BinOp->getOperand(0);
3658 auto *Op1 = BinOp->getOperand(1);
3659 PrevVecV[0] = Op0;
3660 PrevVecV[1] = Op1;
3661 break;
3662 }
3663 default:
3664 return false;
3665 }
3666 ShouldBeCallOrBinInst ^= 1;
3667
3668 OrigCost +=
3669 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
3670
3671 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3672 std::swap(PrevVecV[0], PrevVecV[1]);
3673 InstWorklist.push(PrevVecV[1]);
3674 InstWorklist.push(PrevVecV[0]);
3675 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3676 // We shouldn't have any null values in the previous vectors,
3677 // if so, there was a mismatch in the pattern.
3678 if (ShouldBeCallOrBinInst ||
3679 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3680 return false;
3681
3682 if (SVInst != PrevVecV[1])
3683 return false;
3684
3685 ArrayRef<int> CurMask;
3686 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
3687 m_Mask(CurMask))))
3688 return false;
3689
3690 // Subtract the parity mask when checking the condition.
3691 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
3692 if (Mask < ShuffleMaskHalf &&
3693 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
3694 return false;
3695 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
3696 return false;
3697 }
3698
3699 // Update mask values.
3700 ShuffleMaskHalf *= 2;
3701 ShuffleMaskHalf -= (ExpectedParityMask & 1);
3702 ExpectedParityMask >>= 1;
3703
3703
3704 OrigCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
3705 SVInst->getType(), SVInst->getType(),
3706 CurMask, CostKind);
3707
3708 VisitedCnt += 1;
3709 if (!ExpectedParityMask && VisitedCnt == NumLevels)
3710 break;
3711
3712 ShouldBeCallOrBinInst ^= 1;
3713 } else {
3714 return false;
3715 }
3716 }
3717
3718 // Pattern should end with a shuffle op.
3719 if (ShouldBeCallOrBinInst)
3720 return false;
3721
3722 assert(VecSize != -1 && "Expected Match for Vector Size");
3723
3724 Value *FinalVecV = PrevVecV[0];
3725 if (!FinalVecV)
3726 return false;
3727
3728 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
3729
3730 Intrinsic::ID ReducedOp =
3731 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
3732 : getReductionForBinop(*CommonBinOp));
3733 if (!ReducedOp)
3734 return false;
3735
3736 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
3737 InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
3738
3739 if (NewCost >= OrigCost)
3740 return false;
3741
3742 auto *ReducedResult =
3743 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
3744 replaceValue(I, *ReducedResult);
3745
3746 return true;
3747}
3748
3749/// Determine if it's more efficient to fold:
3750/// reduce(trunc(x)) -> trunc(reduce(x)).
3751/// reduce(sext(x)) -> sext(reduce(x)).
3752/// reduce(zext(x)) -> zext(reduce(x)).
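/// For example (illustrative):
///   %t = trunc <8 x i32> %x to <8 x i16>
///   %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %t)
/// may be rewritten as
///   %s = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %x)
///   %r = trunc i32 %s to i16
/// when the wide reduction plus a scalar cast is the cheaper combination.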
3753bool VectorCombine::foldCastFromReductions(Instruction &I) {
3754 auto *II = dyn_cast<IntrinsicInst>(&I);
3755 if (!II)
3756 return false;
3757
3758 bool TruncOnly = false;
3759 Intrinsic::ID IID = II->getIntrinsicID();
3760 switch (IID) {
3761 case Intrinsic::vector_reduce_add:
3762 case Intrinsic::vector_reduce_mul:
3763 TruncOnly = true;
3764 break;
3765 case Intrinsic::vector_reduce_and:
3766 case Intrinsic::vector_reduce_or:
3767 case Intrinsic::vector_reduce_xor:
3768 break;
3769 default:
3770 return false;
3771 }
3772
3773 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
3774 Value *ReductionSrc = I.getOperand(0);
3775
3776 Value *Src;
3777 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
3778 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
3779 return false;
3780
3781 auto CastOpc =
3782 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
3783
3784 auto *SrcTy = cast<VectorType>(Src->getType());
3785 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
3786 Type *ResultTy = I.getType();
3787
3788 InstructionCost OldCost = TTI.getArithmeticReductionCost(
3789 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
3790 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
3791 TTI::CastContextHint::None, CostKind,
3792 cast<CastInst>(ReductionSrc));
3793 InstructionCost NewCost =
3794 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
3795 CostKind) +
3796 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
3797 TTI::CastContextHint::None, CostKind);
3798
3799 if (OldCost <= NewCost || !NewCost.isValid())
3800 return false;
3801
3802 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
3803 II->getIntrinsicID(), {Src});
3804 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
3805 replaceValue(I, *NewCast);
3806 return true;
3807}
3808
3809/// Returns true if this ShuffleVectorInst eventually feeds into a
3810/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
3811/// chains of shuffles and binary operators (in any combination/order).
3812/// The search is capped at a fixed number of visited instructions.
3813static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
3814 constexpr unsigned MaxVisited = 32;
3815 SmallPtrSet<Instruction *, 8> Visited;
3816 SmallVector<Instruction *, 4> WorkList;
3817 bool FoundReduction = false;
3818
3819 WorkList.push_back(SVI);
3820 while (!WorkList.empty()) {
3821 Instruction *I = WorkList.pop_back_val();
3822 for (User *U : I->users()) {
3823 auto *UI = cast<Instruction>(U);
3824 if (!UI || !Visited.insert(UI).second)
3825 continue;
3826 if (Visited.size() > MaxVisited)
3827 return false;
3828 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
3829 // More than one reduction reached
3830 if (FoundReduction)
3831 return false;
3832 switch (II->getIntrinsicID()) {
3833 case Intrinsic::vector_reduce_add:
3834 case Intrinsic::vector_reduce_mul:
3835 case Intrinsic::vector_reduce_and:
3836 case Intrinsic::vector_reduce_or:
3837 case Intrinsic::vector_reduce_xor:
3838 case Intrinsic::vector_reduce_smin:
3839 case Intrinsic::vector_reduce_smax:
3840 case Intrinsic::vector_reduce_umin:
3841 case Intrinsic::vector_reduce_umax:
3842 FoundReduction = true;
3843 continue;
3844 default:
3845 return false;
3846 }
3847 }
3848
3849 if (!isa<BinaryOperator, ShuffleVectorInst>(UI))
3850 return false;
3851
3852 WorkList.emplace_back(UI);
3853 }
3854 }
3855 return FoundReduction;
3856}
3857
3858/// This method looks for groups of shuffles acting on binops, of the form:
3859/// %x = shuffle ...
3860/// %y = shuffle ...
3861/// %a = binop %x, %y
3862/// %b = binop %x, %y
3863/// shuffle %a, %b, selectmask
3864/// We may, especially if the shuffle is wider than legal, be able to convert
3865/// the shuffle to a form where only parts of a and b need to be computed. On
3866/// architectures with no obvious "select" shuffle, this can reduce the total
3867/// number of operations if the target reports them as cheaper.
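/// Here "selectmask" means a mask such as <i32 0, i32 5, i32 2, i32 7> that
/// picks every lane from either %a or %b without reordering lanes
/// (illustrative description; see the cost checks below for the exact
/// conditions).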
3868bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
3869 auto *SVI = cast<ShuffleVectorInst>(&I);
3870 auto *VT = cast<FixedVectorType>(I.getType());
3871 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
3872 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
3873 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
3874 VT != Op0->getType())
3875 return false;
3876
3877 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
3878 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
3879 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
3880 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
3881 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
3882 auto checkSVNonOpUses = [&](Instruction *I) {
3883 if (!I || I->getOperand(0)->getType() != VT)
3884 return true;
3885 return any_of(I->users(), [&](User *U) {
3886 return U != Op0 && U != Op1 &&
3887 !(isa<ShuffleVectorInst>(U) &&
3888 (InputShuffles.contains(cast<Instruction>(U)) ||
3889 isInstructionTriviallyDead(cast<Instruction>(U))));
3890 });
3891 };
3892 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
3893 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
3894 return false;
3895
3896 // Collect all the uses that are shuffles that we can transform together. We
3897 // may not have a single shuffle, but a group that can all be transformed
3898 // together profitably.
3899 SmallVector<ShuffleVectorInst *> Shuffles;
3900 auto collectShuffles = [&](Instruction *I) {
3901 for (auto *U : I->users()) {
3902 auto *SV = dyn_cast<ShuffleVectorInst>(U);
3903 if (!SV || SV->getType() != VT)
3904 return false;
3905 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
3906 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
3907 return false;
3908 if (!llvm::is_contained(Shuffles, SV))
3909 Shuffles.push_back(SV);
3910 }
3911 return true;
3912 };
3913 if (!collectShuffles(Op0) || !collectShuffles(Op1))
3914 return false;
3915 // From a reduction, we need to be processing a single shuffle, otherwise the
3916 // other uses will not be lane-invariant.
3917 if (FromReduction && Shuffles.size() > 1)
3918 return false;
3919
3920 // Add any shuffle uses for the shuffles we have found, to include them in our
3921 // cost calculations.
3922 if (!FromReduction) {
3923 for (ShuffleVectorInst *SV : Shuffles) {
3924 for (auto *U : SV->users()) {
3925 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
3926 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
3927 Shuffles.push_back(SSV);
3928 }
3929 }
3930 }
3931
3932 // For each of the output shuffles, we try to sort all the first vector
3933 // elements to the beginning, followed by the second array elements at the
3934 // end. If the binops are legalized to smaller vectors, this may reduce total
3935 // number of binops. We compute the ReconstructMask mask needed to convert
3936 // back to the original lane order.
3937 SmallVector<std::pair<int, int>> V1, V2;
3938 SmallVector<SmallVector<int>> OrigReconstructMasks;
3939 int MaxV1Elt = 0, MaxV2Elt = 0;
3940 unsigned NumElts = VT->getNumElements();
3941 for (ShuffleVectorInst *SVN : Shuffles) {
3942 SmallVector<int> Mask;
3943 SVN->getShuffleMask(Mask);
3944
3945 // Check the operands are the same as the original, or reversed (in which
3946 // case we need to commute the mask).
3947 Value *SVOp0 = SVN->getOperand(0);
3948 Value *SVOp1 = SVN->getOperand(1);
3949 if (isa<UndefValue>(SVOp1)) {
3950 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
3951 SVOp0 = SSV->getOperand(0);
3952 SVOp1 = SSV->getOperand(1);
3953 for (int &Elem : Mask) {
3954 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
3955 return false;
3956 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
3957 }
3958 }
3959 if (SVOp0 == Op1 && SVOp1 == Op0) {
3960 std::swap(SVOp0, SVOp1);
3961 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
3962 }
3963 if (SVOp0 != Op0 || SVOp1 != Op1)
3964 return false;
3965
3966 // Calculate the reconstruction mask for this shuffle, as the mask needed to
3967 // take the packed values from Op0/Op1 and reconstructing to the original
3968 // order.
3969 SmallVector<int> ReconstructMask;
3970 for (unsigned I = 0; I < Mask.size(); I++) {
3971 if (Mask[I] < 0) {
3972 ReconstructMask.push_back(-1);
3973 } else if (Mask[I] < static_cast<int>(NumElts)) {
3974 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
3975 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
3976 return Mask[I] == A.first;
3977 });
3978 if (It != V1.end())
3979 ReconstructMask.push_back(It - V1.begin());
3980 else {
3981 ReconstructMask.push_back(V1.size());
3982 V1.emplace_back(Mask[I], V1.size());
3983 }
3984 } else {
3985 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
3986 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
3987 return Mask[I] - static_cast<int>(NumElts) == A.first;
3988 });
3989 if (It != V2.end())
3990 ReconstructMask.push_back(NumElts + It - V2.begin());
3991 else {
3992 ReconstructMask.push_back(NumElts + V2.size());
3993 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
3994 }
3995 }
3996 }
3997
3998 // For reductions, we know that the output lane ordering doesn't alter the
3999 // result. In-order can help simplify the shuffle away.
4000 if (FromReduction)
4001 sort(ReconstructMask);
4002 OrigReconstructMasks.push_back(std::move(ReconstructMask));
4003 }
4004
4005 // If the Maximum element used from V1 and V2 are not larger than the new
4006 // vectors, the vectors are already packed and performing the optimization
4007 // again will likely not help any further. This also prevents us from getting
4008 // stuck in a cycle in case the costs do not also rule it out.
4009 if (V1.empty() || V2.empty() ||
4010 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
4011 MaxV2Elt == static_cast<int>(V2.size()) - 1))
4012 return false;
4013
4014 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
4015 // shuffle of another shuffle, or not a shuffle (that is treated like an
4016 // identity shuffle).
4017 auto GetBaseMaskValue = [&](Instruction *I, int M) {
4018 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4019 if (!SV)
4020 return M;
4021 if (isa<UndefValue>(SV->getOperand(1)))
4022 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4023 if (InputShuffles.contains(SSV))
4024 return SSV->getMaskValue(SV->getMaskValue(M));
4025 return SV->getMaskValue(M);
4026 };
4027
4028 // Attempt to sort the inputs by ascending mask values to make simpler input
4029 // shuffles and push complex shuffles down to the uses. We sort on the first
4030 // of the two input shuffle orders, to try and get at least one input into a
4031 // nice order.
4032 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
4033 std::pair<int, int> Y) {
4034 int MXA = GetBaseMaskValue(A, X.first);
4035 int MYA = GetBaseMaskValue(A, Y.first);
4036 return MXA < MYA;
4037 };
4038 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
4039 return SortBase(SVI0A, A, B);
4040 });
4041 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
4042 return SortBase(SVI1A, A, B);
4043 });
4044 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
4045 // modified order of the input shuffles.
4046 SmallVector<SmallVector<int>> ReconstructMasks;
4047 for (const auto &Mask : OrigReconstructMasks) {
4048 SmallVector<int> ReconstructMask;
4049 for (int M : Mask) {
4050 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
4051 auto It = find_if(V, [M](auto A) { return A.second == M; });
4052 assert(It != V.end() && "Expected all entries in Mask");
4053 return std::distance(V.begin(), It);
4054 };
4055 if (M < 0)
4056 ReconstructMask.push_back(-1);
4057 else if (M < static_cast<int>(NumElts)) {
4058 ReconstructMask.push_back(FindIndex(V1, M));
4059 } else {
4060 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
4061 }
4062 }
4063 ReconstructMasks.push_back(std::move(ReconstructMask));
4064 }
4065
4066 // Calculate the masks needed for the new input shuffles, which get padded
4067 // with undef
4068 SmallVector<int> V1A, V1B, V2A, V2B;
4069 for (unsigned I = 0; I < V1.size(); I++) {
4070 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
4071 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
4072 }
4073 for (unsigned I = 0; I < V2.size(); I++) {
4074 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
4075 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
4076 }
4077 while (V1A.size() < NumElts) {
4078 V1A.push_back(PoisonMaskElem);
4079 V1B.push_back(PoisonMaskElem);
4080 }
4081 while (V2A.size() < NumElts) {
4082 V2A.push_back(PoisonMaskElem);
4083 V2B.push_back(PoisonMaskElem);
4084 }
4085
4086 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
4087 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4088 if (!SV)
4089 return C;
4090 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
4091 ? TTI::SK_PermuteSingleSrc
4092 : TTI::SK_PermuteTwoSrc,
4093 VT, VT, SV->getShuffleMask(), CostKind);
4094 };
4095 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
4096 return C +
4097 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
4098 };
4099
4100 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
4101 unsigned MaxVectorSize =
4102 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
4103 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
4104 if (MaxElementsInVector == 0)
4105 return false;
4106 // When there are multiple shufflevector operations on the same input,
4107 // especially when the vector length is larger than the register size,
4108 // identical shuffle patterns may occur across different groups of elements.
4109 // To avoid overestimating the cost by counting these repeated shuffles more
4110 // than once, we only account for unique shuffle patterns. This adjustment
4111 // prevents inflated costs in the cost model for wide vectors split into
4112 // several register-sized groups.
4113 std::set<SmallVector<int, 4>> UniqueShuffles;
4114 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
4115 // Compute the cost for performing the shuffle over the full vector.
4116 auto ShuffleCost =
4117 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
4118 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
4119 if (NumFullVectors < 2)
4120 return C + ShuffleCost;
4121 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
4122 unsigned NumUniqueGroups = 0;
4123 unsigned NumGroups = Mask.size() / MaxElementsInVector;
4124 // For each group of MaxElementsInVector contiguous elements,
4125 // collect their shuffle pattern and insert into the set of unique patterns.
4126 for (unsigned I = 0; I < NumFullVectors; ++I) {
4127 for (unsigned J = 0; J < MaxElementsInVector; ++J)
4128 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
4129 if (UniqueShuffles.insert(SubShuffle).second)
4130 NumUniqueGroups += 1;
4131 }
4132 return C + ShuffleCost * NumUniqueGroups / NumGroups;
4133 };
4134 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
4135 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4136 if (!SV)
4137 return C;
4138 SmallVector<int, 16> Mask;
4139 SV->getShuffleMask(Mask);
4140 return AddShuffleMaskAdjustedCost(C, Mask);
4141 };
4142 // Check that input consists of ShuffleVectors applied to the same input
4143 auto AllShufflesHaveSameOperands =
4144 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
4145 if (InputShuffles.size() < 2)
4146 return false;
4147 ShuffleVectorInst *FirstSV =
4148 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
4149 if (!FirstSV)
4150 return false;
4151
4152 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
4153 return std::all_of(
4154 std::next(InputShuffles.begin()), InputShuffles.end(),
4155 [&](Instruction *I) {
4156 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
4157 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
4158 });
4159 };
4160
4161 // Get the costs of the shuffles + binops before and after with the new
4162 // shuffle masks.
4163 InstructionCost CostBefore =
4164 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
4165 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
4166 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
4167 InstructionCost(0), AddShuffleCost);
4168 if (AllShufflesHaveSameOperands(InputShuffles)) {
4169 UniqueShuffles.clear();
4170 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4171 InstructionCost(0), AddShuffleAdjustedCost);
4172 } else {
4173 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4174 InstructionCost(0), AddShuffleCost);
4175 }
4176
4177 // The new binops will be unused for lanes past the used shuffle lengths.
4178 // These types attempt to get the correct cost for that from the target.
4179 FixedVectorType *Op0SmallVT =
4180 FixedVectorType::get(VT->getScalarType(), V1.size());
4181 FixedVectorType *Op1SmallVT =
4182 FixedVectorType::get(VT->getScalarType(), V2.size());
4183 InstructionCost CostAfter =
4184 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
4185 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
4186 UniqueShuffles.clear();
4187 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
4188 InstructionCost(0), AddShuffleMaskAdjustedCost);
4189 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
4190 CostAfter +=
4191 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
4192 InstructionCost(0), AddShuffleMaskCost);
4193
4194 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
4195 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
4196 << " vs CostAfter: " << CostAfter << "\n");
4197 if (CostBefore < CostAfter ||
4198 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
4199 return false;
4200
4201 // The cost model has passed, create the new instructions.
4202 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
4203 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4204 if (!SV)
4205 return I;
4206 if (isa<UndefValue>(SV->getOperand(1)))
4207 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4208 if (InputShuffles.contains(SSV))
4209 return SSV->getOperand(Op);
4210 return SV->getOperand(Op);
4211 };
4212 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
4213 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
4214 GetShuffleOperand(SVI0A, 1), V1A);
4215 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
4216 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
4217 GetShuffleOperand(SVI0B, 1), V1B);
4218 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
4219 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
4220 GetShuffleOperand(SVI1A, 1), V2A);
4221 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
4222 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
4223 GetShuffleOperand(SVI1B, 1), V2B);
4224 Builder.SetInsertPoint(Op0);
4225 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
4226 NSV0A, NSV0B);
4227 if (auto *I = dyn_cast<Instruction>(NOp0))
4228 I->copyIRFlags(Op0, true);
4229 Builder.SetInsertPoint(Op1);
4230 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
4231 NSV1A, NSV1B);
4232 if (auto *I = dyn_cast<Instruction>(NOp1))
4233 I->copyIRFlags(Op1, true);
4234
4235 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
4236 Builder.SetInsertPoint(Shuffles[S]);
4237 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
4238 replaceValue(*Shuffles[S], *NSV, false);
4239 }
4240
4241 Worklist.pushValue(NSV0A);
4242 Worklist.pushValue(NSV0B);
4243 Worklist.pushValue(NSV1A);
4244 Worklist.pushValue(NSV1B);
4245 return true;
4246}
4247
4248/// Check if instruction depends on ZExt and this ZExt can be moved after the
4249/// instruction. Move ZExt if it is profitable. For example:
4250/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
4251/// lshr((zext(x),y) -> zext(lshr(x,trunc(y)))
4252/// Cost model calculations takes into account if zext(x) has other users and
4253/// whether it can be propagated through them too.
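/// For example (illustrative):
///   %zx = zext <8 x i8> %x to <8 x i32>
///   %r  = and <8 x i32> %zx, splat (i32 15)
/// can become
///   %a  = and <8 x i8> %x, splat (i8 15)
///   %r  = zext <8 x i8> %a to <8 x i32>
/// assuming the narrow op plus the moved zext is no more expensive.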
4254bool VectorCombine::shrinkType(Instruction &I) {
4255 Value *ZExted, *OtherOperand;
4256 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
4257 m_Value(OtherOperand))) &&
4258 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
4259 return false;
4260
4261 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
4262
4263 auto *BigTy = cast<FixedVectorType>(I.getType());
4264 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
4265 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
4266
4267 if (I.getOpcode() == Instruction::LShr) {
4268 // Check that the shift amount is less than the number of bits in the
4269 // smaller type. Otherwise, the smaller lshr will return a poison value.
4270 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
4271 if (ShAmtKB.getMaxValue().uge(BW))
4272 return false;
4273 } else {
4274 // Check that the expression overall uses at most the same number of bits as
4275 // ZExted
4276 KnownBits KB = computeKnownBits(&I, *DL);
4277 if (KB.countMaxActiveBits() > BW)
4278 return false;
4279 }
4280
4281 // Calculate costs of leaving current IR as it is and moving ZExt operation
4282 // later, along with adding truncates if needed
4283 InstructionCost ZExtCost = TTI.getCastInstrCost(
4284 Instruction::ZExt, BigTy, SmallTy,
4285 TargetTransformInfo::CastContextHint::None, CostKind);
4286 InstructionCost CurrentCost = ZExtCost;
4287 InstructionCost ShrinkCost = 0;
4288
4289 // Calculate total cost and check that we can propagate through all ZExt users
4290 for (User *U : ZExtOperand->users()) {
4291 auto *UI = cast<Instruction>(U);
4292 if (UI == &I) {
4293 CurrentCost +=
4294 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4295 ShrinkCost +=
4296 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4297 ShrinkCost += ZExtCost;
4298 continue;
4299 }
4300
4301 if (!Instruction::isBinaryOp(UI->getOpcode()))
4302 return false;
4303
4304 // Check if we can propagate ZExt through its other users
4305 KnownBits KB = computeKnownBits(UI, *DL);
4306 if (KB.countMaxActiveBits() > BW)
4307 return false;
4308
4309 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4310 ShrinkCost +=
4311 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4312 ShrinkCost += ZExtCost;
4313 }
4314
4315 // If the other instruction operand is not a constant, we'll need to
4316 // generate a truncate instruction. So we have to adjust the cost.
4317 if (!isa<Constant>(OtherOperand))
4318 ShrinkCost += TTI.getCastInstrCost(
4319 Instruction::Trunc, SmallTy, BigTy,
4320 TargetTransformInfo::CastContextHint::None, CostKind);
4321
4322 // If the cost of shrinking types and leaving the IR is the same, we'll lean
4323 // towards modifying the IR because shrinking opens opportunities for other
4324 // shrinking optimisations.
4325 if (ShrinkCost > CurrentCost)
4326 return false;
4327
4328 Builder.SetInsertPoint(&I);
4329 Value *Op0 = ZExted;
4330 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
4331 // Keep the order of operands the same
4332 if (I.getOperand(0) == OtherOperand)
4333 std::swap(Op0, Op1);
4334 Value *NewBinOp =
4335 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
4336 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
4337 cast<Instruction>(NewBinOp)->copyMetadata(I);
4338 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
4339 replaceValue(I, *NewZExtr);
4340 return true;
4341}
4342
4343/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
4344/// shuffle (DstVec, SrcVec, Mask)
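/// For example (illustrative), with two <4 x i32> vectors:
///   %e = extractelement <4 x i32> %src, i64 2
///   %r = insertelement <4 x i32> %dst, i32 %e, i64 0
/// becomes (subject to the cost checks below)
///   %r = shufflevector <4 x i32> %dst, <4 x i32> %src,
///        <4 x i32> <i32 6, i32 1, i32 2, i32 3>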
4345bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
4346 Value *DstVec, *SrcVec;
4347 uint64_t ExtIdx, InsIdx;
4348 if (!match(&I,
4349 m_InsertElt(m_Value(DstVec),
4350 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
4351 m_ConstantInt(InsIdx))))
4352 return false;
4353
4354 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
4355 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
4356 // We can try combining vectors with different element counts; the element types must match.
4357 if (!DstVecTy || !SrcVecTy ||
4358 SrcVecTy->getElementType() != DstVecTy->getElementType())
4359 return false;
4360
4361 unsigned NumDstElts = DstVecTy->getNumElements();
4362 unsigned NumSrcElts = SrcVecTy->getNumElements();
4363 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
4364 return false;
4365
4366 // Insertion into poison is a cheaper single operand shuffle.
4367 TTI::ShuffleKind SK = TTI::SK_PermuteTwoSrc;
4368 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
4369
4370 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4371 bool IsExtIdxInBounds = ExtIdx < NumDstElts;
4372 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
4373 if (NeedDstSrcSwap) {
4374 SK = TTI::SK_PermuteSingleSrc;
4375 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4376 Mask[InsIdx] = 0;
4377 else
4378 Mask[InsIdx] = ExtIdx;
4379 std::swap(DstVec, SrcVec);
4380 } else {
4382 std::iota(Mask.begin(), Mask.end(), 0);
4383 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4384 Mask[InsIdx] = NumDstElts;
4385 else
4386 Mask[InsIdx] = ExtIdx + NumDstElts;
4387 }
4388
4389 // Cost
4390 auto *Ins = cast<InsertElementInst>(&I);
4391 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
4392 InstructionCost InsCost =
4393 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
4394 InstructionCost ExtCost =
4395 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
4396 InstructionCost OldCost = ExtCost + InsCost;
4397
4398 InstructionCost NewCost = 0;
4399 SmallVector<int> ExtToVecMask;
4400 if (!NeedExpOrNarrow) {
4401 // Ignore 'free' identity insertion shuffle.
4402 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
4403 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
4404 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
4405 nullptr, {DstVec, SrcVec});
4406 } else {
4407 // When creating a length-changing vector, always create it with a mask whose
4408 // first element is ExtIdx, so that the first element of the vector being
4409 // created is always the target to be extracted.
4410 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4411 if (IsExtIdxInBounds)
4412 ExtToVecMask[ExtIdx] = ExtIdx;
4413 else
4414 ExtToVecMask[0] = ExtIdx;
4415 // Add cost for expanding or narrowing
4416 NewCost = TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
4417 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
4418 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
4419 }
4420
4421 if (!Ext->hasOneUse())
4422 NewCost += ExtCost;
4423
4424 LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair: " << I
4425 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4426 << "\n");
4427
4428 if (OldCost < NewCost)
4429 return false;
4430
4431 if (NeedExpOrNarrow) {
4432 if (!NeedDstSrcSwap)
4433 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
4434 else
4435 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
4436 }
4437
4438 // Canonicalize undef param to RHS to help further folds.
4439 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
4440 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
4441 std::swap(DstVec, SrcVec);
4442 }
4443
4444 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
4445 replaceValue(I, *Shuf);
4446
4447 return true;
4448}
4449
4450/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
4451/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
4452/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
4453/// before casting it back into `<vscale x 16 x i32>`.
4454bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
4455 const APInt *SplatVal0, *SplatVal1;
4456 if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
4457 m_APInt(SplatVal0), m_APInt(SplatVal1))))
4458 return false;
4459
4460 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
4461 << "\n");
4462
4463 auto *VTy =
4464 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
4465 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
4466 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
4467
4468 // Just in case the cost of interleave2 intrinsic and bitcast are both
4469 // invalid, in which case we want to bail out, we use <= rather
4470 // than < here. Even if they both have valid and equal costs, it's probably
4471 // not a good idea to emit a high-cost constant splat.
4472 if (TTI.getInstructionCost(&I, CostKind) <=
4473 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
4474 TTI::CastContextHint::None, CostKind)) {
4475 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
4476 << *I.getType() << " is too high.\n");
4477 return false;
4478 }
4479
4480 APInt NewSplatVal = SplatVal1->zext(Width * 2);
4481 NewSplatVal <<= Width;
4482 NewSplatVal |= SplatVal0->zext(Width * 2);
4483 auto *NewSplat = ConstantVector::getSplat(
4484 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
4485
4486 IRBuilder<> Builder(&I);
4487 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
4488 return true;
4489}
4490
4491// Attempt to shrink loads that are only used by shufflevector instructions.
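// For example (illustrative): if a load of <8 x float> is only used as
//   %s = shufflevector <8 x float> %ld, <8 x float> poison,
//        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// then a <4 x float> load of the same address covers every referenced lane,
// and the shuffle is rewritten to use the narrower load when the cost model
// agrees.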
4492bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
4493 auto *OldLoad = dyn_cast<LoadInst>(&I);
4494 if (!OldLoad || !OldLoad->isSimple())
4495 return false;
4496
4497 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
4498 if (!OldLoadTy)
4499 return false;
4500
4501 unsigned const OldNumElements = OldLoadTy->getNumElements();
4502
4503 // Search all uses of load. If all uses are shufflevector instructions, and
4504 // the second operands are all poison values, find the minimum and maximum
4505 // indices of the vector elements referenced by all shuffle masks.
4506 // Otherwise return `std::nullopt`.
4507 using IndexRange = std::pair<int, int>;
4508 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
4509 IndexRange OutputRange = IndexRange(OldNumElements, -1);
4510 for (llvm::Use &Use : I.uses()) {
4511 // Ensure all uses match the required pattern.
4512 User *Shuffle = Use.getUser();
4513 ArrayRef<int> Mask;
4514
4515 if (!match(Shuffle,
4516 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
4517 return std::nullopt;
4518
4519 // Ignore shufflevector instructions that have no uses.
4520 if (Shuffle->use_empty())
4521 continue;
4522
4523 // Find the min and max indices used by the shufflevector instruction.
4524 for (int Index : Mask) {
4525 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
4526 OutputRange.first = std::min(Index, OutputRange.first);
4527 OutputRange.second = std::max(Index, OutputRange.second);
4528 }
4529 }
4530 }
4531
4532 if (OutputRange.second < OutputRange.first)
4533 return std::nullopt;
4534
4535 return OutputRange;
4536 };
4537
4538 // Get the range of vector elements used by shufflevector instructions.
4539 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
4540 unsigned const NewNumElements = Indices->second + 1u;
4541
4542 // If the range of vector elements is smaller than the full load, attempt
4543 // to create a smaller load.
4544 if (NewNumElements < OldNumElements) {
4545 IRBuilder Builder(&I);
4546 Builder.SetCurrentDebugLocation(I.getDebugLoc());
4547
4548 // Calculate costs of old and new ops.
4549 Type *ElemTy = OldLoadTy->getElementType();
4550 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
4551 Value *PtrOp = OldLoad->getPointerOperand();
4552
4553 InstructionCost OldCost = TTI.getMemoryOpCost(
4554 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
4555 OldLoad->getPointerAddressSpace(), CostKind);
4556 InstructionCost NewCost =
4557 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
4558 OldLoad->getPointerAddressSpace(), CostKind);
4559
4560 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
4561 SmallVector<UseEntry, 4u> NewUses;
4562 unsigned const MaxIndex = NewNumElements * 2u;
4563
4564 for (llvm::Use &Use : I.uses()) {
4565 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
4566 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
4567
4568 // Create entry for new use.
4569 NewUses.push_back({Shuffle, OldMask});
4570
4571 // Validate mask indices.
4572 for (int Index : OldMask) {
4573 if (Index >= static_cast<int>(MaxIndex))
4574 return false;
4575 }
4576
4577 // Update costs.
4578 OldCost +=
4579 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4580 OldLoadTy, OldMask, CostKind);
4581 NewCost +=
4582 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4583 NewLoadTy, OldMask, CostKind);
4584 }
4585
4586 LLVM_DEBUG(
4587 dbgs() << "Found a load used only by shufflevector instructions: "
4588 << I << "\n OldCost: " << OldCost
4589 << " vs NewCost: " << NewCost << "\n");
4590
4591 if (OldCost < NewCost || !NewCost.isValid())
4592 return false;
4593
4594 // Create new load of smaller vector.
4595 auto *NewLoad = cast<LoadInst>(
4596 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
4597 NewLoad->copyMetadata(I);
4598
4599 // Replace all uses.
4600 for (UseEntry &Use : NewUses) {
4601 ShuffleVectorInst *Shuffle = Use.first;
4602 std::vector<int> &NewMask = Use.second;
4603
4604 Builder.SetInsertPoint(Shuffle);
4605 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
4606 Value *NewShuffle = Builder.CreateShuffleVector(
4607 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
4608
4609 replaceValue(*Shuffle, *NewShuffle, false);
4610 }
4611
4612 return true;
4613 }
4614 }
4615 return false;
4616}
4617
4618// Attempt to narrow a phi of shufflevector instructions where the two incoming
4619// values have the same operands but different masks. If the two shuffle masks
4620// are offsets of one another we can use one branch to rotate the incoming
4621// vector and perform one larger shuffle after the phi.
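// For example (illustrative), widening a <4 x i32> %v to <8 x i32>:
//   incoming 0: shufflevector %v, poison,
//               <i32 1, i32 2, i32 3, i32 1, i32 2, i32 3, i32 poison, i32 poison>
//   incoming 1: shufflevector %v, poison,
//               <i32 0, i32 1, i32 2, i32 0, i32 1, i32 2, i32 poison, i32 poison>
// The masks differ by a constant offset of 1, so incoming 0 can instead rotate
// %v by one lane (mask <1, 2, 3, 0>), the phi is narrowed to <4 x i32>, and
// the second mask is applied once after the phi.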
4622bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
4623 auto *Phi = dyn_cast<PHINode>(&I);
4624 if (!Phi || Phi->getNumIncomingValues() != 2u)
4625 return false;
4626
4627 Value *Op = nullptr;
4628 ArrayRef<int> Mask0;
4629 ArrayRef<int> Mask1;
4630
4631 if (!match(Phi->getOperand(0u),
4632 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
4633 !match(Phi->getOperand(1u),
4634 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
4635 return false;
4636
4637 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
4638
4639 // Ensure result vectors are wider than the argument vector.
4640 auto *InputVT = cast<FixedVectorType>(Op->getType());
4641 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
4642 auto const InputNumElements = InputVT->getNumElements();
4643
4644 if (InputNumElements >= ResultVT->getNumElements())
4645 return false;
4646
4647 // Take the difference of the two shuffle masks at each index. Ignore poison
4648 // values at the same index in both masks.
4649 SmallVector<int, 16> NewMask;
4650 NewMask.reserve(Mask0.size());
4651
4652 for (auto [M0, M1] : zip(Mask0, Mask1)) {
4653 if (M0 >= 0 && M1 >= 0)
4654 NewMask.push_back(M0 - M1);
4655 else if (M0 == -1 && M1 == -1)
4656 continue;
4657 else
4658 return false;
4659 }
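// E.g. for the sketch above, Mask0 = <2,3,2,3,...> and Mask1 = <0,1,0,1,...>
// give a difference of 2 at every index.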
4660
4661 // Ensure all elements of the new mask are equal. If the difference between
4662 // the incoming mask elements is the same at every index, the two masks must
4663 // be constant offsets of one another.
4664 if (NewMask.empty() || !all_equal(NewMask))
4665 return false;
4666
4667 // Create new mask using difference of the two incoming masks.
4668 int MaskOffset = NewMask[0u];
4669 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
4670 NewMask.clear();
4671
4672 for (unsigned I = 0u; I < InputNumElements; ++I) {
4673 NewMask.push_back(Index);
4674 Index = (Index + 1u) % InputNumElements;
4675 }
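// E.g. with MaskOffset == 2 and a four-element input, Index starts at
// (4 + 2) % 4 == 2 and the rotation mask becomes <2, 3, 0, 1>.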
4676
4677 // Calculate costs for worst cases and compare.
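// Only one incoming shuffle executes on any given path, so the old worst case
// is the dearer of the two; the new worst case is the narrow rotation plus the
// single wide shuffle emitted after the phi.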
4678 auto const Kind = TTI::SK_PermuteSingleSrc;
4679 auto OldCost =
4680 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
4681 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
4682 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
4683 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
4684
4685 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
4686 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4687 << "\n");
4688
4689 if (NewCost > OldCost)
4690 return false;
4691
4692 // Create new shuffles and narrowed phi.
4693 auto Builder = IRBuilder(Shuf);
4694 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
4695 auto *PoisonVal = PoisonValue::get(InputVT);
4696 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
4697 Worklist.push(cast<Instruction>(NewShuf0));
4698
4699 Builder.SetInsertPoint(Phi);
4700 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
4701 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
4702 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
4703 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
4704
4705 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
4706 PoisonVal = PoisonValue::get(NewPhi->getType());
4707 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
4708
4709 replaceValue(*Phi, *NewShuf1);
4710 return true;
4711}
4712
4713/// This is the entry point for all transforms. Pass manager differences are
4714/// handled in the callers of this function.
4715bool VectorCombine::run() {
4716 if (DisableVectorCombine)
4717 return false;
4718
4719 // Don't attempt vectorization if the target does not support vectors.
4720 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
4721 return false;
4722
4723 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
4724
4725 auto FoldInst = [this](Instruction &I) {
4726 Builder.SetInsertPoint(&I);
4727 bool IsVectorType = isa<VectorType>(I.getType());
4728 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
4729 auto Opcode = I.getOpcode();
4730
4731 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
4732
4733 // These folds should be beneficial regardless of when this pass is run
4734 // in the optimization pipeline.
4735 // The type checking is for run-time efficiency. We can avoid wasting time
4736 // dispatching to folding functions if there's no chance of matching.
4737 if (IsFixedVectorType) {
4738 switch (Opcode) {
4739 case Instruction::InsertElement:
4740 if (vectorizeLoadInsert(I))
4741 return true;
4742 break;
4743 case Instruction::ShuffleVector:
4744 if (widenSubvectorLoad(I))
4745 return true;
4746 break;
4747 default:
4748 break;
4749 }
4750 }
4751
4752 // These transforms work with scalable and fixed vectors.
4753 // TODO: Identify and allow other scalable transforms.
4754 if (IsVectorType) {
4755 if (scalarizeOpOrCmp(I))
4756 return true;
4757 if (scalarizeLoad(I))
4758 return true;
4759 if (scalarizeExtExtract(I))
4760 return true;
4761 if (scalarizeVPIntrinsic(I))
4762 return true;
4763 if (foldInterleaveIntrinsics(I))
4764 return true;
4765 }
4766
4767 if (Opcode == Instruction::Store)
4768 if (foldSingleElementStore(I))
4769 return true;
4770
4771 // If this is an early pipeline invocation of this pass, we are done.
4772 if (TryEarlyFoldsOnly)
4773 return false;
4774
4775 // Otherwise, try folds that improve codegen but may interfere with
4776 // early IR canonicalizations.
4777 // The type checking is for run-time efficiency. We can avoid wasting time
4778 // dispatching to folding functions if there's no chance of matching.
4779 if (IsFixedVectorType) {
4780 switch (Opcode) {
4781 case Instruction::InsertElement:
4782 if (foldInsExtFNeg(I))
4783 return true;
4784 if (foldInsExtBinop(I))
4785 return true;
4786 if (foldInsExtVectorToShuffle(I))
4787 return true;
4788 break;
4789 case Instruction::ShuffleVector:
4790 if (foldPermuteOfBinops(I))
4791 return true;
4792 if (foldShuffleOfBinops(I))
4793 return true;
4794 if (foldShuffleOfSelects(I))
4795 return true;
4796 if (foldShuffleOfCastops(I))
4797 return true;
4798 if (foldShuffleOfShuffles(I))
4799 return true;
4800 if (foldPermuteOfIntrinsic(I))
4801 return true;
4802 if (foldShuffleOfIntrinsics(I))
4803 return true;
4804 if (foldSelectShuffle(I))
4805 return true;
4806 if (foldShuffleToIdentity(I))
4807 return true;
4808 break;
4809 case Instruction::Load:
4810 if (shrinkLoadForShuffles(I))
4811 return true;
4812 break;
4813 case Instruction::BitCast:
4814 if (foldBitcastShuffle(I))
4815 return true;
4816 break;
4817 case Instruction::And:
4818 case Instruction::Or:
4819 case Instruction::Xor:
4820 if (foldBitOpOfCastops(I))
4821 return true;
4822 if (foldBitOpOfCastConstant(I))
4823 return true;
4824 break;
4825 case Instruction::PHI:
4826 if (shrinkPhiOfShuffles(I))
4827 return true;
4828 break;
4829 default:
4830 if (shrinkType(I))
4831 return true;
4832 break;
4833 }
4834 } else {
4835 switch (Opcode) {
4836 case Instruction::Call:
4837 if (foldShuffleFromReductions(I))
4838 return true;
4839 if (foldCastFromReductions(I))
4840 return true;
4841 break;
4842 case Instruction::ExtractElement:
4843 if (foldShuffleChainsToReduce(I))
4844 return true;
4845 break;
4846 case Instruction::ICmp:
4847 case Instruction::FCmp:
4848 if (foldExtractExtract(I))
4849 return true;
4850 break;
4851 case Instruction::Or:
4852 if (foldConcatOfBoolMasks(I))
4853 return true;
4854 [[fallthrough]];
4855 default:
4856 if (Instruction::isBinaryOp(Opcode)) {
4857 if (foldExtractExtract(I))
4858 return true;
4859 if (foldExtractedCmps(I))
4860 return true;
4861 if (foldBinopOfReductions(I))
4862 return true;
4863 }
4864 break;
4865 }
4866 }
4867 return false;
4868 };
4869
4870 bool MadeChange = false;
4871 for (BasicBlock &BB : F) {
4872 // Ignore unreachable basic blocks.
4873 if (!DT.isReachableFromEntry(&BB))
4874 continue;
4875 // Instructions may be erased while we iterate, but make_early_inc_range is
4876 // not applicable here: the pre-fetched next iterator may itself be
4877 // invalidated by RecursivelyDeleteTriviallyDeadInstructions. Instead we
4878 // manually maintain the next instruction (NextInst) and update it when the
4879 // instruction it points to is about to be deleted.
4880 Instruction *I = &BB.front();
4881 while (I) {
4882 NextInst = I->getNextNode();
4883 if (!I->isDebugOrPseudoInst())
4884 MadeChange |= FoldInst(*I);
4885 I = NextInst;
4886 }
4887 }
4888
4889 NextInst = nullptr;
4890
4891 while (!Worklist.isEmpty()) {
4892 Instruction *I = Worklist.removeOne();
4893 if (!I)
4894 continue;
4895
4896 if (isInstructionTriviallyDead(I)) {
4897 eraseInstruction(*I);
4898 continue;
4899 }
4900
4901 MadeChange |= FoldInst(*I);
4902 }
4903
4904 return MadeChange;
4905}
4906
4907 PreservedAnalyses VectorCombinePass::run(Function &F,
4908 FunctionAnalysisManager &FAM) {
4909 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
4910 TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
4911 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
4912 AAResults &AA = FAM.getResult<AAManager>(F);
4913 const DataLayout *DL = &F.getDataLayout();
4914 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
4915 TryEarlyFoldsOnly);
4916 if (!Combiner.run())
4917 return PreservedAnalyses::all();
4918 PreservedAnalyses PA;
4919 PA.preserveSet<CFGAnalyses>();
4920 return PA;
4921}