LLVM 19.0.0git
SystemZTargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a TargetTransformInfo analysis pass specific to the
10// SystemZ target machine. It uses the target's detailed information to provide
11// more precise answers to certain TTI queries, while letting the target
12// independent and default TTI implementations handle the rest.
13//
14//===----------------------------------------------------------------------===//
15
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/Support/Debug.h"
25
26using namespace llvm;
27
28#define DEBUG_TYPE "systemztti"
29
30//===----------------------------------------------------------------------===//
31//
32// SystemZ cost model.
33//
34//===----------------------------------------------------------------------===//
35
36static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
37 bool UsedAsMemCpySource = false;
38 for (const User *U : V->users())
39 if (const Instruction *User = dyn_cast<Instruction>(U)) {
40 if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
41 UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
42 continue;
43 }
44 if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
45 if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
46 UsedAsMemCpySource = true;
47 continue;
48 }
49 }
50 OtherUse = true;
51 }
52 return UsedAsMemCpySource;
53}
54
56 unsigned Bonus = 0;
57
58 // Increase the threshold if an incoming argument is used only as a memcpy
59 // source.
60 if (Function *Callee = CB->getCalledFunction())
61 for (Argument &Arg : Callee->args()) {
62 bool OtherUse = false;
63 if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse)
64 Bonus += 150;
65 }
66
67 LLVM_DEBUG(if (Bonus)
68 dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
69 return Bonus;
70}
71
74 assert(Ty->isIntegerTy());
75
76 unsigned BitSize = Ty->getPrimitiveSizeInBits();
77 // There is no cost model for constants with a bit size of 0. Return TCC_Free
78 // here, so that constant hoisting will ignore this constant.
79 if (BitSize == 0)
80 return TTI::TCC_Free;
81 // No cost model for operations on integers larger than 128 bit implemented yet.
82 if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
83 return TTI::TCC_Free;
84
85 if (Imm == 0)
86 return TTI::TCC_Free;
87
88 if (Imm.getBitWidth() <= 64) {
89 // Constants loaded via lgfi.
90 if (isInt<32>(Imm.getSExtValue()))
91 return TTI::TCC_Basic;
92 // Constants loaded via llilf.
93 if (isUInt<32>(Imm.getZExtValue()))
94 return TTI::TCC_Basic;
95 // Constants loaded via llihf:
96 if ((Imm.getZExtValue() & 0xffffffff) == 0)
97 return TTI::TCC_Basic;
98
99 return 2 * TTI::TCC_Basic;
100 }
101
102 // i128 immediates are loaded from the constant pool.
103 return 2 * TTI::TCC_Basic;
104}
105
107 const APInt &Imm, Type *Ty,
109 Instruction *Inst) {
110 assert(Ty->isIntegerTy());
111
112 unsigned BitSize = Ty->getPrimitiveSizeInBits();
113 // There is no cost model for constants with a bit size of 0. Return TCC_Free
114 // here, so that constant hoisting will ignore this constant.
115 if (BitSize == 0)
116 return TTI::TCC_Free;
117 // No cost model for operations on integers larger than 64 bit implemented yet.
118 if (BitSize > 64)
119 return TTI::TCC_Free;
120
121 switch (Opcode) {
122 default:
123 return TTI::TCC_Free;
124 case Instruction::GetElementPtr:
125 // Always hoist the base address of a GetElementPtr. This prevents the
126 // creation of new constants for every base constant that gets constant
127 // folded with the offset.
128 if (Idx == 0)
129 return 2 * TTI::TCC_Basic;
130 return TTI::TCC_Free;
131 case Instruction::Store:
132 if (Idx == 0 && Imm.getBitWidth() <= 64) {
133 // Any 8-bit immediate store can be implemented via mvi.
134 if (BitSize == 8)
135 return TTI::TCC_Free;
136 // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
137 if (isInt<16>(Imm.getSExtValue()))
138 return TTI::TCC_Free;
139 }
140 break;
141 case Instruction::ICmp:
142 if (Idx == 1 && Imm.getBitWidth() <= 64) {
143 // Comparisons against signed 32-bit immediates implemented via cgfi.
144 if (isInt<32>(Imm.getSExtValue()))
145 return TTI::TCC_Free;
146 // Comparisons against unsigned 32-bit immediates implemented via clgfi.
147 if (isUInt<32>(Imm.getZExtValue()))
148 return TTI::TCC_Free;
149 }
150 break;
151 case Instruction::Add:
152 case Instruction::Sub:
153 if (Idx == 1 && Imm.getBitWidth() <= 64) {
154 // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
155 if (isUInt<32>(Imm.getZExtValue()))
156 return TTI::TCC_Free;
157 // Or their negation, by swapping addition vs. subtraction.
158 if (isUInt<32>(-Imm.getSExtValue()))
159 return TTI::TCC_Free;
160 }
161 break;
162 case Instruction::Mul:
163 if (Idx == 1 && Imm.getBitWidth() <= 64) {
164 // We use msgfi to multiply by 32-bit signed immediates.
165 if (isInt<32>(Imm.getSExtValue()))
166 return TTI::TCC_Free;
167 }
168 break;
169 case Instruction::Or:
170 case Instruction::Xor:
171 if (Idx == 1 && Imm.getBitWidth() <= 64) {
172 // Masks supported by oilf/xilf.
173 if (isUInt<32>(Imm.getZExtValue()))
174 return TTI::TCC_Free;
175 // Masks supported by oihf/xihf.
176 if ((Imm.getZExtValue() & 0xffffffff) == 0)
177 return TTI::TCC_Free;
178 }
179 break;
180 case Instruction::And:
181 if (Idx == 1 && Imm.getBitWidth() <= 64) {
182 // Any 32-bit AND operation can be implemented via nilf.
183 if (BitSize <= 32)
184 return TTI::TCC_Free;
185 // 64-bit masks supported by nilf.
186 if (isUInt<32>(~Imm.getZExtValue()))
187 return TTI::TCC_Free;
188 // 64-bit masks supported by nilh.
189 if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
190 return TTI::TCC_Free;
191 // Some 64-bit AND operations can be implemented via risbg.
192 const SystemZInstrInfo *TII = ST->getInstrInfo();
193 unsigned Start, End;
194 if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
195 return TTI::TCC_Free;
196 }
197 break;
198 case Instruction::Shl:
199 case Instruction::LShr:
200 case Instruction::AShr:
201 // Always return TCC_Free for the shift value of a shift instruction.
202 if (Idx == 1)
203 return TTI::TCC_Free;
204 break;
205 case Instruction::UDiv:
206 case Instruction::SDiv:
207 case Instruction::URem:
208 case Instruction::SRem:
209 case Instruction::Trunc:
210 case Instruction::ZExt:
211 case Instruction::SExt:
212 case Instruction::IntToPtr:
213 case Instruction::PtrToInt:
214 case Instruction::BitCast:
215 case Instruction::PHI:
216 case Instruction::Call:
217 case Instruction::Select:
218 case Instruction::Ret:
219 case Instruction::Load:
220 break;
221 }
222
224}
225
228 const APInt &Imm, Type *Ty,
230 assert(Ty->isIntegerTy());
231
232 unsigned BitSize = Ty->getPrimitiveSizeInBits();
233 // There is no cost model for constants with a bit size of 0. Return TCC_Free
234 // here, so that constant hoisting will ignore this constant.
235 if (BitSize == 0)
236 return TTI::TCC_Free;
237 // No cost model for operations on integers larger than 64 bit implemented yet.
238 if (BitSize > 64)
239 return TTI::TCC_Free;
240
241 switch (IID) {
242 default:
243 return TTI::TCC_Free;
244 case Intrinsic::sadd_with_overflow:
245 case Intrinsic::uadd_with_overflow:
246 case Intrinsic::ssub_with_overflow:
247 case Intrinsic::usub_with_overflow:
248 // These get expanded to include a normal addition/subtraction.
249 if (Idx == 1 && Imm.getBitWidth() <= 64) {
250 if (isUInt<32>(Imm.getZExtValue()))
251 return TTI::TCC_Free;
252 if (isUInt<32>(-Imm.getSExtValue()))
253 return TTI::TCC_Free;
254 }
255 break;
256 case Intrinsic::smul_with_overflow:
257 case Intrinsic::umul_with_overflow:
258 // These get expanded to include a normal multiplication.
259 if (Idx == 1 && Imm.getBitWidth() <= 64) {
260 if (isInt<32>(Imm.getSExtValue()))
261 return TTI::TCC_Free;
262 }
263 break;
264 case Intrinsic::experimental_stackmap:
265 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
266 return TTI::TCC_Free;
267 break;
268 case Intrinsic::experimental_patchpoint_void:
269 case Intrinsic::experimental_patchpoint:
270 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
271 return TTI::TCC_Free;
272 break;
273 }
275}
276
279 assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
280 if (ST->hasPopulationCount() && TyWidth <= 64)
282 return TTI::PSK_Software;
283}
284
288 // Find out if L contains a call, what the machine instruction count
289 // estimate is, and how many stores there are.
290 bool HasCall = false;
291 InstructionCost NumStores = 0;
292 for (auto &BB : L->blocks())
293 for (auto &I : *BB) {
294 if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
295 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
296 if (isLoweredToCall(F))
297 HasCall = true;
298 if (F->getIntrinsicID() == Intrinsic::memcpy ||
299 F->getIntrinsicID() == Intrinsic::memset)
300 NumStores++;
301 } else { // indirect call.
302 HasCall = true;
303 }
304 }
305 if (isa<StoreInst>(&I)) {
306 Type *MemAccessTy = I.getOperand(0)->getType();
307 NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy,
308 std::nullopt, 0, TTI::TCK_RecipThroughput);
309 }
310 }
311
312 // The z13 processor will run out of store tags if too many stores
313 // are fed into it too quickly. Therefore make sure there are not
314 // too many stores in the resulting unrolled loop.
315 unsigned const NumStoresVal = *NumStores.getValue();
316 unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);
317
318 if (HasCall) {
319 // Only allow full unrolling if loop has any calls.
320 UP.FullUnrollMaxCount = Max;
321 UP.MaxCount = 1;
322 return;
323 }
324
325 UP.MaxCount = Max;
326 if (UP.MaxCount <= 1)
327 return;
328
329 // Allow partial and runtime trip count unrolling.
330 UP.Partial = UP.Runtime = true;
331
332 UP.PartialThreshold = 75;
334
335 // Allow expensive instructions in the pre-header of the loop.
336 UP.AllowExpensiveTripCount = true;
337
338 UP.Force = true;
339}
340
344}
345
348 // SystemZ specific: check instruction count (first), and don't care about
349 // ImmCost, since offsets are checked explicitly.
350 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
351 C1.NumIVMuls, C1.NumBaseAdds,
352 C1.ScaleCost, C1.SetupCost) <
353 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
354 C2.NumIVMuls, C2.NumBaseAdds,
355 C2.ScaleCost, C2.SetupCost);
356}
357
358unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
359 bool Vector = (ClassID == 1);
360 if (!Vector)
361 // Discount the stack pointer. Also leave out %r0, since it can't
362 // be used in an address.
363 return 14;
364 if (ST->hasVector())
365 return 32;
366 return 0;
367}
368
371 switch (K) {
373 return TypeSize::getFixed(64);
375 return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
377 return TypeSize::getScalable(0);
378 }
379
380 llvm_unreachable("Unsupported register kind");
381}
382
383unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
384 unsigned NumStridedMemAccesses,
385 unsigned NumPrefetches,
386 bool HasCall) const {
387 // Don't prefetch a loop with many far apart accesses.
388 if (NumPrefetches > 16)
389 return UINT_MAX;
390
391 // Emit prefetch instructions for smaller strides in cases where we think
392 // the hardware prefetcher might not be able to keep up.
393 if (NumStridedMemAccesses > 32 && !HasCall &&
394 (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
395 return 1;
396
397 return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
398}
399
400bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
401 EVT VT = TLI->getValueType(DL, DataType);
402 return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
403}
404
405// Return the bit size for the scalar type or vector element
406// type. getScalarSizeInBits() returns 0 for a pointer type.
407static unsigned getScalarSizeInBits(Type *Ty) {
408 unsigned Size =
409 (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
410 assert(Size > 0 && "Element must have non-zero size.");
411 return Size;
412}
413
414// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
415// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
416// 3.
417static unsigned getNumVectorRegs(Type *Ty) {
418 auto *VTy = cast<FixedVectorType>(Ty);
419 unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
420 assert(WideBits > 0 && "Could not compute size of vector");
421 return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
422}
423
425 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
428 const Instruction *CxtI) {
429
430 // TODO: Handle more cost kinds.
432 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
433 Op2Info, Args, CxtI);
434
435 // TODO: return a good value for BB-VECTORIZER that includes the
436 // immediate loads, which we do not want to count for the loop
437 // vectorizer, since they are hopefully hoisted out of the loop. This
438 // would require a new parameter 'InLoop', but not sure if constant
439 // args are common enough to motivate this.
440
441 unsigned ScalarBits = Ty->getScalarSizeInBits();
442
443 // There are three cases of division and remainder: Dividing with a register
444 // needs a divide instruction. A divisor which is a power of two constant
445 // can be implemented with a sequence of shifts. Any other constant needs a
446 // multiply and shifts.
447 const unsigned DivInstrCost = 20;
448 const unsigned DivMulSeqCost = 10;
449 const unsigned SDivPow2Cost = 4;
450
451 bool SignedDivRem =
452 Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
453 bool UnsignedDivRem =
454 Opcode == Instruction::UDiv || Opcode == Instruction::URem;
455
456 // Check for a constant divisor.
457 bool DivRemConst = false;
458 bool DivRemConstPow2 = false;
459 if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
460 if (const Constant *C = dyn_cast<Constant>(Args[1])) {
461 const ConstantInt *CVal =
462 (C->getType()->isVectorTy()
463 ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
464 : dyn_cast<const ConstantInt>(C));
465 if (CVal && (CVal->getValue().isPowerOf2() ||
466 CVal->getValue().isNegatedPowerOf2()))
467 DivRemConstPow2 = true;
468 else
469 DivRemConst = true;
470 }
471 }
472
473 if (!Ty->isVectorTy()) {
474 // These FP operations are supported with a dedicated instruction for
475 // float, double and fp128 (base implementation assumes float generally
476 // costs 2).
477 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
478 Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
479 return 1;
480
481 // There is no native support for FRem.
482 if (Opcode == Instruction::FRem)
483 return LIBCALL_COST;
484
485 // Give discount for some combined logical operations if supported.
486 if (Args.size() == 2) {
487 if (Opcode == Instruction::Xor) {
488 for (const Value *A : Args) {
489 if (const Instruction *I = dyn_cast<Instruction>(A))
490 if (I->hasOneUse() &&
491 (I->getOpcode() == Instruction::Or ||
492 I->getOpcode() == Instruction::And ||
493 I->getOpcode() == Instruction::Xor))
494 if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
495 (isInt128InVR(Ty) &&
496 (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1())))
497 return 0;
498 }
499 }
500 else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
501 for (const Value *A : Args) {
502 if (const Instruction *I = dyn_cast<Instruction>(A))
503 if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
504 ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
505 (isInt128InVR(Ty) &&
506 (Opcode == Instruction::And || ST->hasVectorEnhancements1()))))
507 return 0;
508 }
509 }
510 }
511
512 // Or requires one instruction, although it has custom handling for i64.
513 if (Opcode == Instruction::Or)
514 return 1;
515
516 if (Opcode == Instruction::Xor && ScalarBits == 1) {
517 if (ST->hasLoadStoreOnCond2())
518 return 5; // 2 * (li 0; loc 1); xor
519 return 7; // 2 * ipm sequences ; xor ; shift ; compare
520 }
521
522 if (DivRemConstPow2)
523 return (SignedDivRem ? SDivPow2Cost : 1);
524 if (DivRemConst)
525 return DivMulSeqCost;
526 if (SignedDivRem || UnsignedDivRem)
527 return DivInstrCost;
528 }
529 else if (ST->hasVector()) {
530 auto *VTy = cast<FixedVectorType>(Ty);
531 unsigned VF = VTy->getNumElements();
532 unsigned NumVectors = getNumVectorRegs(Ty);
533
534 // These vector operations are custom handled, but are still supported
535 // with one instruction per vector, regardless of element size.
536 if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
537 Opcode == Instruction::AShr) {
538 return NumVectors;
539 }
540
541 if (DivRemConstPow2)
542 return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
543 if (DivRemConst) {
544 SmallVector<Type *> Tys(Args.size(), Ty);
545 return VF * DivMulSeqCost +
546 getScalarizationOverhead(VTy, Args, Tys, CostKind);
547 }
548 if ((SignedDivRem || UnsignedDivRem) && VF > 4)
549 // Temporary hack: disable high vectorization factors with integer
550 // division/remainder, which will get scalarized and handled with
551 // GR128 registers. The mischeduler is not clever enough to avoid
552 // spilling yet.
553 return 1000;
554
555 // These FP operations are supported with a single vector instruction for
556 // double (base implementation assumes float generally costs 2). For
557 // FP128, the scalar cost is 1, and there is no overhead since the values
558 // are already in scalar registers.
559 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
560 Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
561 switch (ScalarBits) {
562 case 32: {
563 // The vector enhancements facility 1 provides v4f32 instructions.
564 if (ST->hasVectorEnhancements1())
565 return NumVectors;
566 // Return the cost of multiple scalar invocation plus the cost of
567 // inserting and extracting the values.
568 InstructionCost ScalarCost =
570 SmallVector<Type *> Tys(Args.size(), Ty);
572 (VF * ScalarCost) +
573 getScalarizationOverhead(VTy, Args, Tys, CostKind);
574 // FIXME: VF 2 for these FP operations are currently just as
575 // expensive as for VF 4.
576 if (VF == 2)
577 Cost *= 2;
578 return Cost;
579 }
580 case 64:
581 case 128:
582 return NumVectors;
583 default:
584 break;
585 }
586 }
587
588 // There is no native support for FRem.
589 if (Opcode == Instruction::FRem) {
590 SmallVector<Type *> Tys(Args.size(), Ty);
591 InstructionCost Cost = (VF * LIBCALL_COST) +
592 getScalarizationOverhead(VTy, Args, Tys, CostKind);
593 // FIXME: VF 2 for float is currently just as expensive as for VF 4.
594 if (VF == 2 && ScalarBits == 32)
595 Cost *= 2;
596 return Cost;
597 }
598 }
599
600 // Fallback to the default implementation.
601 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
602 Args, CxtI);
603}
604
608 ArrayRef<const Value *> Args, const Instruction *CxtI) {
609 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
610 if (ST->hasVector()) {
611 unsigned NumVectors = getNumVectorRegs(Tp);
612
613 // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
614
615 // FP128 values are always in scalar registers, so there is no work
616 // involved with a shuffle, except for broadcast. In that case register
617 // moves are done with a single instruction per element.
618 if (Tp->getScalarType()->isFP128Ty())
619 return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
620
621 switch (Kind) {
623 // ExtractSubvector Index indicates start offset.
624
625 // Extracting a subvector from first index is a noop.
626 return (Index == 0 ? 0 : NumVectors);
627
629 // Loop vectorizer calls here to figure out the extra cost of
630 // broadcasting a loaded value to all elements of a vector. Since vlrep
631 // loads and replicates with a single instruction, adjust the returned
632 // value.
633 return NumVectors - 1;
634
635 default:
636
637 // SystemZ supports single instruction permutation / replication.
638 return NumVectors;
639 }
640 }
641
642 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
643}
644
645// Return the log2 difference of the element sizes of the two vector types.
646static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
647 unsigned Bits0 = Ty0->getScalarSizeInBits();
648 unsigned Bits1 = Ty1->getScalarSizeInBits();
649
650 if (Bits1 > Bits0)
651 return (Log2_32(Bits1) - Log2_32(Bits0));
652
653 return (Log2_32(Bits0) - Log2_32(Bits1));
654}
655
656// Return the number of instructions needed to truncate SrcTy to DstTy.
658getVectorTruncCost(Type *SrcTy, Type *DstTy) {
659 assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
662 "Packing must reduce size of vector type.");
663 assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
664 cast<FixedVectorType>(DstTy)->getNumElements() &&
665 "Packing should not change number of elements.");
666
667 // TODO: Since fp32 is expanded, the extract cost should always be 0.
668
669 unsigned NumParts = getNumVectorRegs(SrcTy);
670 if (NumParts <= 2)
671 // Up to 2 vector registers can be truncated efficiently with pack or
672 // permute. The latter requires an immediate mask to be loaded, which
673 // typically gets hoisted out of a loop. TODO: return a good value for
674 // BB-VECTORIZER that includes the immediate loads, which we do not want
675 // to count for the loop vectorizer.
676 return 1;
677
678 unsigned Cost = 0;
679 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
680 unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
681 for (unsigned P = 0; P < Log2Diff; ++P) {
682 if (NumParts > 1)
683 NumParts /= 2;
684 Cost += NumParts;
685 }
686
687 // Currently, a general mix of permutes and pack instructions is output by
688 // isel, which follow the cost computation above except for this case which
689 // is one instruction less:
690 if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
691 DstTy->getScalarSizeInBits() == 8)
692 Cost--;
693
694 return Cost;
695}
696
697// Return the cost of converting a vector bitmask produced by a compare
698// (SrcTy), to the type of the select or extend instruction (DstTy).
701 assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
702 "Should only be called with vector types.");
703
704 unsigned PackCost = 0;
705 unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
706 unsigned DstScalarBits = DstTy->getScalarSizeInBits();
707 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
708 if (SrcScalarBits > DstScalarBits)
709 // The bitmask will be truncated.
710 PackCost = getVectorTruncCost(SrcTy, DstTy);
711 else if (SrcScalarBits < DstScalarBits) {
712 unsigned DstNumParts = getNumVectorRegs(DstTy);
713 // Each vector select needs its part of the bitmask unpacked.
714 PackCost = Log2Diff * DstNumParts;
715 // Extra cost for moving part of mask before unpacking.
716 PackCost += DstNumParts - 1;
717 }
718
719 return PackCost;
720}
721
722// Return the type of the compared operands. This is needed to compute the
723// cost for a Select / ZExt or SExt instruction.
724static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
725 Type *OpTy = nullptr;
726 if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
727 OpTy = CI->getOperand(0)->getType();
728 else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
729 if (LogicI->getNumOperands() == 2)
730 if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
731 if (isa<CmpInst>(LogicI->getOperand(1)))
732 OpTy = CI0->getOperand(0)->getType();
733
734 if (OpTy != nullptr) {
735 if (VF == 1) {
736 assert (!OpTy->isVectorTy() && "Expected scalar type");
737 return OpTy;
738 }
739 // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
740 // be either scalar or already vectorized with a same or lesser VF.
741 Type *ElTy = OpTy->getScalarType();
742 return FixedVectorType::get(ElTy, VF);
743 }
744
745 return nullptr;
746}
747
748// Get the cost of converting a boolean vector to a vector with same width
749// and element size as Dst, plus the cost of zero extending if needed.
751getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
752 const Instruction *I) {
753 auto *DstVTy = cast<FixedVectorType>(Dst);
754 unsigned VF = DstVTy->getNumElements();
755 unsigned Cost = 0;
756 // If we know the widths of the compared operands, get any cost of
757 // converting it to match Dst. Otherwise assume same widths.
758 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
759 if (CmpOpTy != nullptr)
760 Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
761 if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
762 // One 'vn' per dst vector with an immediate mask.
763 Cost += getNumVectorRegs(Dst);
764 return Cost;
765}
766
768 Type *Src,
771 const Instruction *I) {
772 // FIXME: Can the logic below also be used for these cost kinds?
774 auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
775 return BaseCost == 0 ? BaseCost : 1;
776 }
777
778 unsigned DstScalarBits = Dst->getScalarSizeInBits();
779 unsigned SrcScalarBits = Src->getScalarSizeInBits();
780
781 if (!Src->isVectorTy()) {
782 assert (!Dst->isVectorTy());
783
784 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
785 if (Src->isIntegerTy(128))
786 return LIBCALL_COST;
787 if (SrcScalarBits >= 32 ||
788 (I != nullptr && isa<LoadInst>(I->getOperand(0))))
789 return 1;
790 return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
791 }
792
793 if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
794 Dst->isIntegerTy(128))
795 return LIBCALL_COST;
796
797 if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
798 if (Src->isIntegerTy(1)) {
799 if (DstScalarBits == 128)
800 return 5 /*branch seq.*/;
801
802 if (ST->hasLoadStoreOnCond2())
803 return 2; // li 0; loc 1
804
805 // This should be extension of a compare i1 result, which is done with
806 // ipm and a varying sequence of instructions.
807 unsigned Cost = 0;
808 if (Opcode == Instruction::SExt)
809 Cost = (DstScalarBits < 64 ? 3 : 4);
810 if (Opcode == Instruction::ZExt)
811 Cost = 3;
812 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
813 if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
814 // If operands of an fp-type were compared, this costs +1.
815 Cost++;
816 return Cost;
817 }
818 else if (isInt128InVR(Dst)) {
819 // Extensions from GPR to i128 (in VR) typically cost two instructions,
820 // but a zero-extending load would be just one extra instruction.
821 if (Opcode == Instruction::ZExt && I != nullptr)
822 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
823 if (Ld->hasOneUse())
824 return 1;
825 return 2;
826 }
827 }
828
829 if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
830 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
831 if (Ld->hasOneUse())
832 return 0; // Will be converted to GPR load.
833 bool OnlyTruncatingStores = true;
834 for (const User *U : I->users())
835 if (!isa<StoreInst>(U)) {
836 OnlyTruncatingStores = false;
837 break;
838 }
839 if (OnlyTruncatingStores)
840 return 0;
841 return 2; // Vector element extraction.
842 }
843 }
844 else if (ST->hasVector()) {
845 // Vector to scalar cast.
846 auto *SrcVecTy = cast<FixedVectorType>(Src);
847 auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
848 if (!DstVecTy) {
849 // TODO: tune vector-to-scalar cast.
850 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
851 }
852 unsigned VF = SrcVecTy->getNumElements();
853 unsigned NumDstVectors = getNumVectorRegs(Dst);
854 unsigned NumSrcVectors = getNumVectorRegs(Src);
855
856 if (Opcode == Instruction::Trunc) {
857 if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
858 return 0; // Check for NOOP conversions.
859 return getVectorTruncCost(Src, Dst);
860 }
861
862 if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
863 if (SrcScalarBits >= 8) {
864 // ZExt will use either a single unpack or a vector permute.
865 if (Opcode == Instruction::ZExt)
866 return NumDstVectors;
867
868 // SExt will be handled with one unpack per doubling of width.
869 unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
870
871 // For types that span multiple vector registers, some additional
872 // instructions are used to setup the unpacking.
873 unsigned NumSrcVectorOps =
874 (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
875 : (NumDstVectors / 2));
876
877 return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
878 }
879 else if (SrcScalarBits == 1)
880 return getBoolVecToIntConversionCost(Opcode, Dst, I);
881 }
882
883 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
884 Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
885 // TODO: Fix base implementation which could simplify things a bit here
886 // (seems to miss on differentiating on scalar/vector types).
887
888 // Only 64 bit vector conversions are natively supported before z15.
889 if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
890 if (SrcScalarBits == DstScalarBits)
891 return NumDstVectors;
892
893 if (SrcScalarBits == 1)
894 return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
895 }
896
897 // Return the cost of multiple scalar invocation plus the cost of
898 // inserting and extracting the values. Base implementation does not
899 // realize float->int gets scalarized.
901 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
902 InstructionCost TotCost = VF * ScalarCost;
903 bool NeedsInserts = true, NeedsExtracts = true;
904 // FP128 registers do not get inserted or extracted.
905 if (DstScalarBits == 128 &&
906 (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
907 NeedsInserts = false;
908 if (SrcScalarBits == 128 &&
909 (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
910 NeedsExtracts = false;
911
912 TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
913 NeedsExtracts, CostKind);
914 TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
915 /*Extract*/ false, CostKind);
916
917 // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
918 if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
919 TotCost *= 2;
920
921 return TotCost;
922 }
923
924 if (Opcode == Instruction::FPTrunc) {
925 if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
926 return VF /*ldxbr/lexbr*/ +
927 getScalarizationOverhead(DstVecTy, /*Insert*/ true,
928 /*Extract*/ false, CostKind);
929 else // double -> float
930 return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
931 }
932
933 if (Opcode == Instruction::FPExt) {
934 if (SrcScalarBits == 32 && DstScalarBits == 64) {
935 // float -> double is very rare and currently unoptimized. Instead of
936 // using vldeb, which can do two at a time, all conversions are
937 // scalarized.
938 return VF * 2;
939 }
940 // -> fp128. VF * lxdb/lxeb + extraction of elements.
941 return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
942 /*Extract*/ true, CostKind);
943 }
944 }
945
946 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
947}
948
949// Scalar i8 / i16 operations will typically be made after first extending
950// the operands to i32.
951static unsigned getOperandsExtensionCost(const Instruction *I) {
952 unsigned ExtCost = 0;
953 for (Value *Op : I->operands())
954 // A load of i8 or i16 sign/zero extends to i32.
955 if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
956 ExtCost++;
957
958 return ExtCost;
959}
960
962 Type *CondTy,
963 CmpInst::Predicate VecPred,
965 const Instruction *I) {
967 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
968
969 if (!ValTy->isVectorTy()) {
970 switch (Opcode) {
971 case Instruction::ICmp: {
972 // A loaded value compared with 0 with multiple users becomes Load and
973 // Test. The load is then not foldable, so return 0 cost for the ICmp.
974 unsigned ScalarBits = ValTy->getScalarSizeInBits();
975 if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
976 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
977 if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
978 if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
979 C->isZero())
980 return 0;
981
982 unsigned Cost = 1;
983 if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
984 Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
985 return Cost;
986 }
987 case Instruction::Select:
988 if (ValTy->isFloatingPointTy() || isInt128InVR(ValTy))
989 return 4; // No LOC for FP / i128 - costs a conditional jump.
990 return 1; // Load On Condition / Select Register.
991 }
992 }
993 else if (ST->hasVector()) {
994 unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();
995
996 // Called with a compare instruction.
997 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
998 unsigned PredicateExtraCost = 0;
999 if (I != nullptr) {
1000 // Some predicates cost one or two extra instructions.
1001 switch (cast<CmpInst>(I)->getPredicate()) {
1007 PredicateExtraCost = 1;
1008 break;
1013 PredicateExtraCost = 2;
1014 break;
1015 default:
1016 break;
1017 }
1018 }
1019
1020 // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
1021 // floats. FIXME: <2 x float> generates same code as <4 x float>.
1022 unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
1023 unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
1024
1025 unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
1026 return Cost;
1027 }
1028 else { // Called with a select instruction.
1029 assert (Opcode == Instruction::Select);
1030
1031 // We can figure out the extra cost of packing / unpacking if the
1032 // instruction was passed and the compare instruction is found.
1033 unsigned PackCost = 0;
1034 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
1035 if (CmpOpTy != nullptr)
1036 PackCost =
1037 getVectorBitmaskConversionCost(CmpOpTy, ValTy);
1038
1039 return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
1040 }
1041 }
1042
1043 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
1044}
1045
1048 unsigned Index, Value *Op0,
1049 Value *Op1) {
1050 // vlvgp will insert two grs into a vector register, so only count half the
1051 // number of instructions.
1052 if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
1053 return ((Index % 2 == 0) ? 1 : 0);
1054
1055 if (Opcode == Instruction::ExtractElement) {
1056 int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
1057
1058 // Give a slight penalty for moving out of vector pipeline to FXU unit.
1059 if (Index == 0 && Val->isIntOrIntVectorTy())
1060 Cost += 1;
1061
1062 return Cost;
1063 }
1064
1065 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1066}
1067
1068// Check if a load may be folded as a memory operand in its user.
1070isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
1071 if (!Ld->hasOneUse())
1072 return false;
1073 FoldedValue = Ld;
1074 const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
1075 unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
1076 unsigned TruncBits = 0;
1077 unsigned SExtBits = 0;
1078 unsigned ZExtBits = 0;
1079 if (UserI->hasOneUse()) {
1080 unsigned UserBits = UserI->getType()->getScalarSizeInBits();
1081 if (isa<TruncInst>(UserI))
1082 TruncBits = UserBits;
1083 else if (isa<SExtInst>(UserI))
1084 SExtBits = UserBits;
1085 else if (isa<ZExtInst>(UserI))
1086 ZExtBits = UserBits;
1087 }
1088 if (TruncBits || SExtBits || ZExtBits) {
1089 FoldedValue = UserI;
1090 UserI = cast<Instruction>(*UserI->user_begin());
1091 // Load (single use) -> trunc/extend (single use) -> UserI
1092 }
1093 if ((UserI->getOpcode() == Instruction::Sub ||
1094 UserI->getOpcode() == Instruction::SDiv ||
1095 UserI->getOpcode() == Instruction::UDiv) &&
1096 UserI->getOperand(1) != FoldedValue)
1097 return false; // Not commutative, only RHS foldable.
1098 // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
1099 // extension was made of the load.
1100 unsigned LoadOrTruncBits =
1101 ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
1102 switch (UserI->getOpcode()) {
1103 case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
1104 case Instruction::Sub:
1105 case Instruction::ICmp:
1106 if (LoadedBits == 32 && ZExtBits == 64)
1107 return true;
1108 [[fallthrough]];
1109 case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
1110 if (UserI->getOpcode() != Instruction::ICmp) {
1111 if (LoadedBits == 16 &&
1112 (SExtBits == 32 ||
1113 (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
1114 return true;
1115 if (LoadOrTruncBits == 16)
1116 return true;
1117 }
1118 [[fallthrough]];
1119 case Instruction::SDiv:// SE: 32->64
1120 if (LoadedBits == 32 && SExtBits == 64)
1121 return true;
1122 [[fallthrough]];
1123 case Instruction::UDiv:
1124 case Instruction::And:
1125 case Instruction::Or:
1126 case Instruction::Xor:
1127 // This also makes sense for float operations, but disabled for now due
1128 // to regressions.
1129 // case Instruction::FCmp:
1130 // case Instruction::FAdd:
1131 // case Instruction::FSub:
1132 // case Instruction::FMul:
1133 // case Instruction::FDiv:
1134
1135 // All possible extensions of memory checked above.
1136
1137 // Comparison between memory and immediate.
1138 if (UserI->getOpcode() == Instruction::ICmp)
1139 if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
1140 if (CI->getValue().isIntN(16))
1141 return true;
1142 return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
1143 break;
1144 }
1145 return false;
1146}
1147
1148static bool isBswapIntrinsicCall(const Value *V) {
1149 if (const Instruction *I = dyn_cast<Instruction>(V))
1150 if (auto *CI = dyn_cast<CallInst>(I))
1151 if (auto *F = CI->getCalledFunction())
1152 if (F->getIntrinsicID() == Intrinsic::bswap)
1153 return true;
1154 return false;
1155}
1156
1158 MaybeAlign Alignment,
1159 unsigned AddressSpace,
1161 TTI::OperandValueInfo OpInfo,
1162 const Instruction *I) {
1163 assert(!Src->isVoidTy() && "Invalid type");
1164
1165 // TODO: Handle other cost kinds.
1167 return 1;
1168
1169 if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
1170 // Store the load or its truncated or extended value in FoldedValue.
1171 const Instruction *FoldedValue = nullptr;
1172 if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
1173 const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
1174 assert (UserI->getNumOperands() == 2 && "Expected a binop.");
1175
1176 // UserI can't fold two loads, so in that case return 0 cost only
1177 // half of the time.
1178 for (unsigned i = 0; i < 2; ++i) {
1179 if (UserI->getOperand(i) == FoldedValue)
1180 continue;
1181
1182 if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
1183 LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
1184 if (!OtherLoad &&
1185 (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
1186 isa<ZExtInst>(OtherOp)))
1187 OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
1188 if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
1189 return i == 0; // Both operands foldable.
1190 }
1191 }
1192
1193 return 0; // Only I is foldable in user.
1194 }
1195 }
1196
1197 // Type legalization (via getNumberOfParts) can't handle structs
1198 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1199 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1200 CostKind);
1201
1202 // FP128 is a legal type but kept in a register pair on older CPUs.
1203 if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
1204 return 2;
1205
1206 unsigned NumOps =
1207 (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
1208
1209 // Store/Load reversed saves one instruction.
1210 if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
1211 I != nullptr) {
1212 if (Opcode == Instruction::Load && I->hasOneUse()) {
1213 const Instruction *LdUser = cast<Instruction>(*I->user_begin());
1214 // In case of load -> bswap -> store, return normal cost for the load.
1215 if (isBswapIntrinsicCall(LdUser) &&
1216 (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
1217 return 0;
1218 }
1219 else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
1220 const Value *StoredVal = SI->getValueOperand();
1221 if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
1222 return 0;
1223 }
1224 }
1225
1226 return NumOps;
1227}
1228
1229// The generic implementation of getInterleavedMemoryOpCost() is based on
1230// adding costs of the memory operations plus all the extracts and inserts
1231// needed for using / defining the vector operands. The SystemZ version does
1232// roughly the same but bases the computations on vector permutations
1233// instead.
1235 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1236 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1237 bool UseMaskForCond, bool UseMaskForGaps) {
1238 if (UseMaskForCond || UseMaskForGaps)
1239 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1240 Alignment, AddressSpace, CostKind,
1241 UseMaskForCond, UseMaskForGaps);
1242 assert(isa<VectorType>(VecTy) &&
1243 "Expect a vector type for interleaved memory op");
1244
1245 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1246 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1247 unsigned VF = NumElts / Factor;
1248 unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
1249 unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
1250 unsigned NumPermutes = 0;
1251
1252 if (Opcode == Instruction::Load) {
1253 // Loading interleave groups may have gaps, which may mean fewer
1254 // loads. Find out how many vectors will be loaded in total, and in how
1255 // many of them each value will be in.
1256 BitVector UsedInsts(NumVectorMemOps, false);
1257 std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
1258 for (unsigned Index : Indices)
1259 for (unsigned Elt = 0; Elt < VF; ++Elt) {
1260 unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
1261 UsedInsts.set(Vec);
1262 ValueVecs[Index].set(Vec);
1263 }
1264 NumVectorMemOps = UsedInsts.count();
1265
1266 for (unsigned Index : Indices) {
1267 // Estimate that each loaded source vector containing this Index
1268 // requires one operation, except that vperm can handle two input
1269 // registers first time for each dst vector.
1270 unsigned NumSrcVecs = ValueVecs[Index].count();
1271 unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
1272 assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
1273 NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
1274 }
1275 } else {
1276 // Estimate the permutes for each stored vector as the smaller of the
1277 // number of elements and the number of source vectors. Subtract one per
1278 // dst vector for vperm (S.A.).
1279 unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
1280 unsigned NumDstVecs = NumVectorMemOps;
1281 NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
1282 }
1283
1284 // Cost of load/store operations and the permutations needed.
1285 return NumVectorMemOps + NumPermutes;
1286}
1287
1288static int
1290 const SmallVectorImpl<Type *> &ParamTys) {
1291 if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
1292 return getNumVectorRegs(RetTy); // VPERM
1293
1294 if (ID == Intrinsic::vector_reduce_add) {
1295 // Retrieve number and size of elements for the vector op.
1296 auto *VTy = cast<FixedVectorType>(ParamTys.front());
1297 unsigned ScalarSize = VTy->getScalarSizeInBits();
1298 // For scalar sizes >128 bits, we fall back to the generic cost estimate.
1299 if (ScalarSize > SystemZ::VectorBits)
1300 return -1;
1301 // This many vector regs are needed to represent the input elements (V).
1302 unsigned VectorRegsNeeded = getNumVectorRegs(VTy);
1303 // This many instructions are needed for the final sum of vector elems (S).
1304 unsigned LastVectorHandling = (ScalarSize < 32) ? 3 : 2;
1305 // We use vector adds to create a sum vector, which takes
1306 // V/2 + V/4 + ... = V - 1 operations.
1307 // Then, we need S operations to sum up the elements of that sum vector,
1308 // for a total of V + S - 1 operations.
1309 int Cost = VectorRegsNeeded + LastVectorHandling - 1;
1310 return Cost;
1311 }
1312 return -1;
1313}
1314
1319 ICA.getID(), ICA.getReturnType(), ICA.getArgTypes());
1320 if (Cost != -1)
1321 return Cost;
1323}
1324
1326 // Always expand on Subtargets without vector instructions
1327 if (!ST->hasVector())
1328 return true;
1329
1330 // Always expand for operands that do not fill one vector reg
1331 auto *Type = cast<FixedVectorType>(II->getOperand(0)->getType());
1332 unsigned NumElts = Type->getNumElements();
1333 unsigned ScalarSize = Type->getScalarSizeInBits();
1334 unsigned MaxElts = SystemZ::VectorBits / ScalarSize;
1335 if (NumElts < MaxElts)
1336 return true;
1337
1338 // Otherwise
1339 switch (II->getIntrinsicID()) {
1340 // Do not expand vector.reduce.add
1341 case Intrinsic::vector_reduce_add:
1342 // Except for i64, since the performance benefit is dubious there
1343 return ScalarSize >= 64;
1344 default:
1345 return true;
1346 }
1347}
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
const HexagonInstrInfo * TII
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
#define P(N)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1)
static bool isBswapIntrinsicCall(const Value *V)
static unsigned getOperandsExtensionCost(const Instruction *I)
static Type * getCmpOpsType(const Instruction *I, unsigned VF=1)
static unsigned getScalarSizeInBits(Type *Ty)
static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, const SmallVectorImpl< Type * > &ParamTys)
static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse)
static unsigned getNumVectorRegs(Type *Ty)
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:76
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:891
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:969
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:762
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:654
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
BitVector & set()
Definition: BitVector.h:351
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1467
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1715
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:956
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:966
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:996
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:990
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:974
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:977
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:975
@ ICMP_NE
not equal
Definition: InstrTypes.h:988
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:994
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:992
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:976
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:145
This is an important base class in LLVM.
Definition: Constant.h:41
This class represents an Operation in the Expression.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
An instruction for reading from memory.
Definition: Instructions.h:184
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
This class wraps the llvm.memcpy intrinsic.
The optimization diagnostic interface.
The main scalar evolution driver.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
const SystemZInstrInfo * getInstrInfo() const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
bool shouldExpandReduction(const IntrinsicInst *II) const
unsigned getNumberOfRegisters(unsigned ClassID) const
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, const Instruction *I)
unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const override
unsigned adjustInliningThreshold(const CallBase *CB) const
unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool hasDivRemOp(Type *DataType, bool IsSigned)
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isLoweredToCall(const Function *F) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:330
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:333
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:234
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:163
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:262
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:187
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
const unsigned VectorBits
Definition: SystemZ.h:154
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
AddressSpace
Definition: NVPTXBaseInfo.h:21
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:264
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
InstructionCost Cost
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:34
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
unsigned Insns
TODO: Some of these could be merged.
Parameters that control the generic loop unrolling transformation.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned FullUnrollMaxCount
Set the maximum unrolling factor for full unrolling.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...