LLVM 20.0.0git
SystemZTargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a TargetTransformInfo analysis pass specific to the
10// SystemZ target machine. It uses the target's detailed information to provide
11// more precise answers to certain TTI queries, while letting the target
12// independent and default TTI implementations handle the rest.
13//
14//===----------------------------------------------------------------------===//
15
23#include "llvm/IR/Intrinsics.h"
24#include "llvm/Support/Debug.h"
26
27using namespace llvm;
28
29#define DEBUG_TYPE "systemztti"
30
31//===----------------------------------------------------------------------===//
32//
33// SystemZ cost model.
34//
35//===----------------------------------------------------------------------===//
36
37static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
38 bool UsedAsMemCpySource = false;
39 for (const User *U : V->users())
40 if (const Instruction *User = dyn_cast<Instruction>(U)) {
41 if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
42 UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
43 continue;
44 }
45 if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
46 if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
47 UsedAsMemCpySource = true;
48 continue;
49 }
50 }
51 OtherUse = true;
52 }
53 return UsedAsMemCpySource;
54}
55
57 unsigned Bonus = 0;
58
59 // Increase the threshold if an incoming argument is used only as a memcpy
60 // source.
61 if (Function *Callee = CB->getCalledFunction())
62 for (Argument &Arg : Callee->args()) {
63 bool OtherUse = false;
64 if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse)
65 Bonus += 150;
66 }
67
68 LLVM_DEBUG(if (Bonus)
69 dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
70 return Bonus;
71}
72
75 assert(Ty->isIntegerTy());
76
77 unsigned BitSize = Ty->getPrimitiveSizeInBits();
78 // There is no cost model for constants with a bit size of 0. Return TCC_Free
79 // here, so that constant hoisting will ignore this constant.
80 if (BitSize == 0)
81 return TTI::TCC_Free;
82 // No cost model for operations on integers larger than 128 bit implemented yet.
83 if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
84 return TTI::TCC_Free;
85
86 if (Imm == 0)
87 return TTI::TCC_Free;
88
89 if (Imm.getBitWidth() <= 64) {
90 // Constants loaded via lgfi.
91 if (isInt<32>(Imm.getSExtValue()))
92 return TTI::TCC_Basic;
93 // Constants loaded via llilf.
94 if (isUInt<32>(Imm.getZExtValue()))
95 return TTI::TCC_Basic;
96 // Constants loaded via llihf:
97 if ((Imm.getZExtValue() & 0xffffffff) == 0)
98 return TTI::TCC_Basic;
99
100 return 2 * TTI::TCC_Basic;
101 }
102
103 // i128 immediates loads from Constant Pool
104 return 2 * TTI::TCC_Basic;
105}
106
108 const APInt &Imm, Type *Ty,
110 Instruction *Inst) {
111 assert(Ty->isIntegerTy());
112
113 unsigned BitSize = Ty->getPrimitiveSizeInBits();
114 // There is no cost model for constants with a bit size of 0. Return TCC_Free
115 // here, so that constant hoisting will ignore this constant.
116 if (BitSize == 0)
117 return TTI::TCC_Free;
118 // No cost model for operations on integers larger than 64 bit implemented yet.
119 if (BitSize > 64)
120 return TTI::TCC_Free;
121
122 switch (Opcode) {
123 default:
124 return TTI::TCC_Free;
125 case Instruction::GetElementPtr:
126 // Always hoist the base address of a GetElementPtr. This prevents the
127 // creation of new constants for every base constant that gets constant
128 // folded with the offset.
129 if (Idx == 0)
130 return 2 * TTI::TCC_Basic;
131 return TTI::TCC_Free;
132 case Instruction::Store:
133 if (Idx == 0 && Imm.getBitWidth() <= 64) {
134 // Any 8-bit immediate store can be implemented via mvi.
135 if (BitSize == 8)
136 return TTI::TCC_Free;
137 // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
138 if (isInt<16>(Imm.getSExtValue()))
139 return TTI::TCC_Free;
140 }
141 break;
142 case Instruction::ICmp:
143 if (Idx == 1 && Imm.getBitWidth() <= 64) {
144 // Comparisons against signed 32-bit immediates implemented via cgfi.
145 if (isInt<32>(Imm.getSExtValue()))
146 return TTI::TCC_Free;
147 // Comparisons against unsigned 32-bit immediates implemented via clgfi.
148 if (isUInt<32>(Imm.getZExtValue()))
149 return TTI::TCC_Free;
150 }
151 break;
152 case Instruction::Add:
153 case Instruction::Sub:
154 if (Idx == 1 && Imm.getBitWidth() <= 64) {
155 // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
156 if (isUInt<32>(Imm.getZExtValue()))
157 return TTI::TCC_Free;
158 // Or their negation, by swapping addition vs. subtraction.
159 if (isUInt<32>(-Imm.getSExtValue()))
160 return TTI::TCC_Free;
161 }
162 break;
163 case Instruction::Mul:
164 if (Idx == 1 && Imm.getBitWidth() <= 64) {
165 // We use msgfi to multiply by 32-bit signed immediates.
166 if (isInt<32>(Imm.getSExtValue()))
167 return TTI::TCC_Free;
168 }
169 break;
170 case Instruction::Or:
171 case Instruction::Xor:
172 if (Idx == 1 && Imm.getBitWidth() <= 64) {
173 // Masks supported by oilf/xilf.
174 if (isUInt<32>(Imm.getZExtValue()))
175 return TTI::TCC_Free;
176 // Masks supported by oihf/xihf.
177 if ((Imm.getZExtValue() & 0xffffffff) == 0)
178 return TTI::TCC_Free;
179 }
180 break;
181 case Instruction::And:
182 if (Idx == 1 && Imm.getBitWidth() <= 64) {
183 // Any 32-bit AND operation can be implemented via nilf.
184 if (BitSize <= 32)
185 return TTI::TCC_Free;
186 // 64-bit masks supported by nilf.
187 if (isUInt<32>(~Imm.getZExtValue()))
188 return TTI::TCC_Free;
189 // 64-bit masks supported by nilh.
190 if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
191 return TTI::TCC_Free;
192 // Some 64-bit AND operations can be implemented via risbg.
193 const SystemZInstrInfo *TII = ST->getInstrInfo();
194 unsigned Start, End;
195 if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
196 return TTI::TCC_Free;
197 }
198 break;
199 case Instruction::Shl:
200 case Instruction::LShr:
201 case Instruction::AShr:
202 // Always return TCC_Free for the shift value of a shift instruction.
203 if (Idx == 1)
204 return TTI::TCC_Free;
205 break;
206 case Instruction::UDiv:
207 case Instruction::SDiv:
208 case Instruction::URem:
209 case Instruction::SRem:
210 case Instruction::Trunc:
211 case Instruction::ZExt:
212 case Instruction::SExt:
213 case Instruction::IntToPtr:
214 case Instruction::PtrToInt:
215 case Instruction::BitCast:
216 case Instruction::PHI:
217 case Instruction::Call:
218 case Instruction::Select:
219 case Instruction::Ret:
220 case Instruction::Load:
221 break;
222 }
223
225}
226
229 const APInt &Imm, Type *Ty,
231 assert(Ty->isIntegerTy());
232
233 unsigned BitSize = Ty->getPrimitiveSizeInBits();
234 // There is no cost model for constants with a bit size of 0. Return TCC_Free
235 // here, so that constant hoisting will ignore this constant.
236 if (BitSize == 0)
237 return TTI::TCC_Free;
238 // No cost model for operations on integers larger than 64 bit implemented yet.
239 if (BitSize > 64)
240 return TTI::TCC_Free;
241
242 switch (IID) {
243 default:
244 return TTI::TCC_Free;
245 case Intrinsic::sadd_with_overflow:
246 case Intrinsic::uadd_with_overflow:
247 case Intrinsic::ssub_with_overflow:
248 case Intrinsic::usub_with_overflow:
249 // These get expanded to include a normal addition/subtraction.
250 if (Idx == 1 && Imm.getBitWidth() <= 64) {
251 if (isUInt<32>(Imm.getZExtValue()))
252 return TTI::TCC_Free;
253 if (isUInt<32>(-Imm.getSExtValue()))
254 return TTI::TCC_Free;
255 }
256 break;
257 case Intrinsic::smul_with_overflow:
258 case Intrinsic::umul_with_overflow:
259 // These get expanded to include a normal multiplication.
260 if (Idx == 1 && Imm.getBitWidth() <= 64) {
261 if (isInt<32>(Imm.getSExtValue()))
262 return TTI::TCC_Free;
263 }
264 break;
265 case Intrinsic::experimental_stackmap:
266 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
267 return TTI::TCC_Free;
268 break;
269 case Intrinsic::experimental_patchpoint_void:
270 case Intrinsic::experimental_patchpoint:
271 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
272 return TTI::TCC_Free;
273 break;
274 }
276}
277
280 assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
281 if (ST->hasPopulationCount() && TyWidth <= 64)
283 return TTI::PSK_Software;
284}
285
289 // Find out if L contains a call, what the machine instruction count
290 // estimate is, and how many stores there are.
291 bool HasCall = false;
292 InstructionCost NumStores = 0;
293 for (auto &BB : L->blocks())
294 for (auto &I : *BB) {
295 if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
296 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
297 if (isLoweredToCall(F))
298 HasCall = true;
299 if (F->getIntrinsicID() == Intrinsic::memcpy ||
300 F->getIntrinsicID() == Intrinsic::memset)
301 NumStores++;
302 } else { // indirect call.
303 HasCall = true;
304 }
305 }
306 if (isa<StoreInst>(&I)) {
307 Type *MemAccessTy = I.getOperand(0)->getType();
308 NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy,
309 std::nullopt, 0, TTI::TCK_RecipThroughput);
310 }
311 }
312
313 // The z13 processor will run out of store tags if too many stores
314 // are fed into it too quickly. Therefore make sure there are not
315 // too many stores in the resulting unrolled loop.
316 unsigned const NumStoresVal = *NumStores.getValue();
317 unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);
318
319 if (HasCall) {
320 // Only allow full unrolling if loop has any calls.
321 UP.FullUnrollMaxCount = Max;
322 UP.MaxCount = 1;
323 return;
324 }
325
326 UP.MaxCount = Max;
327 if (UP.MaxCount <= 1)
328 return;
329
330 // Allow partial and runtime trip count unrolling.
331 UP.Partial = UP.Runtime = true;
332
333 UP.PartialThreshold = 75;
335
336 // Allow expensive instructions in the pre-header of the loop.
337 UP.AllowExpensiveTripCount = true;
338
339 UP.Force = true;
340}
341
345}
346
349 // SystemZ specific: check instruction count (first), and don't care about
350 // ImmCost, since offsets are checked explicitly.
351 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
352 C1.NumIVMuls, C1.NumBaseAdds,
353 C1.ScaleCost, C1.SetupCost) <
354 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
355 C2.NumIVMuls, C2.NumBaseAdds,
356 C2.ScaleCost, C2.SetupCost);
357}
358
359unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
360 bool Vector = (ClassID == 1);
361 if (!Vector)
362 // Discount the stack pointer. Also leave out %r0, since it can't
363 // be used in an address.
364 return 14;
365 if (ST->hasVector())
366 return 32;
367 return 0;
368}
369
372 switch (K) {
374 return TypeSize::getFixed(64);
376 return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
378 return TypeSize::getScalable(0);
379 }
380
381 llvm_unreachable("Unsupported register kind");
382}
383
384unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
385 unsigned NumStridedMemAccesses,
386 unsigned NumPrefetches,
387 bool HasCall) const {
388 // Don't prefetch a loop with many far apart accesses.
389 if (NumPrefetches > 16)
390 return UINT_MAX;
391
392 // Emit prefetch instructions for smaller strides in cases where we think
393 // the hardware prefetcher might not be able to keep up.
394 if (NumStridedMemAccesses > 32 && !HasCall &&
395 (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
396 return 1;
397
398 return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
399}
400
401bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
402 EVT VT = TLI->getValueType(DL, DataType);
403 return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
404}
405
406// Return the bit size for the scalar type or vector element
407// type. getScalarSizeInBits() returns 0 for a pointer type.
408static unsigned getScalarSizeInBits(Type *Ty) {
409 unsigned Size =
410 (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
411 assert(Size > 0 && "Element must have non-zero size.");
412 return Size;
413}
414
415// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
416// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
417// 3.
418static unsigned getNumVectorRegs(Type *Ty) {
419 auto *VTy = cast<FixedVectorType>(Ty);
420 unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
421 assert(WideBits > 0 && "Could not compute size of vector");
422 return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
423}
424
426 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
429 const Instruction *CxtI) {
430
431 // TODO: Handle more cost kinds.
433 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
434 Op2Info, Args, CxtI);
435
436 // TODO: return a good value for BB-VECTORIZER that includes the
437 // immediate loads, which we do not want to count for the loop
438 // vectorizer, since they are hopefully hoisted out of the loop. This
439 // would require a new parameter 'InLoop', but not sure if constant
440 // args are common enough to motivate this.
441
442 unsigned ScalarBits = Ty->getScalarSizeInBits();
443
444 // There are three cases of division and remainder: Dividing with a register
445 // needs a divide instruction. A divisor which is a power of two constant
446 // can be implemented with a sequence of shifts. Any other constant needs a
447 // multiply and shifts.
448 const unsigned DivInstrCost = 20;
449 const unsigned DivMulSeqCost = 10;
450 const unsigned SDivPow2Cost = 4;
451
452 bool SignedDivRem =
453 Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
454 bool UnsignedDivRem =
455 Opcode == Instruction::UDiv || Opcode == Instruction::URem;
456
457 // Check for a constant divisor.
458 bool DivRemConst = false;
459 bool DivRemConstPow2 = false;
460 if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
461 if (const Constant *C = dyn_cast<Constant>(Args[1])) {
462 const ConstantInt *CVal =
463 (C->getType()->isVectorTy()
464 ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
465 : dyn_cast<const ConstantInt>(C));
466 if (CVal && (CVal->getValue().isPowerOf2() ||
467 CVal->getValue().isNegatedPowerOf2()))
468 DivRemConstPow2 = true;
469 else
470 DivRemConst = true;
471 }
472 }
473
474 if (!Ty->isVectorTy()) {
475 // These FP operations are supported with a dedicated instruction for
476 // float, double and fp128 (base implementation assumes float generally
477 // costs 2).
478 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
479 Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
480 return 1;
481
482 // There is no native support for FRem.
483 if (Opcode == Instruction::FRem)
484 return LIBCALL_COST;
485
486 // Give discount for some combined logical operations if supported.
487 if (Args.size() == 2) {
488 if (Opcode == Instruction::Xor) {
489 for (const Value *A : Args) {
490 if (const Instruction *I = dyn_cast<Instruction>(A))
491 if (I->hasOneUse() &&
492 (I->getOpcode() == Instruction::Or ||
493 I->getOpcode() == Instruction::And ||
494 I->getOpcode() == Instruction::Xor))
495 if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
496 (isInt128InVR(Ty) &&
497 (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1())))
498 return 0;
499 }
500 }
501 else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
502 for (const Value *A : Args) {
503 if (const Instruction *I = dyn_cast<Instruction>(A))
504 if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
505 ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
506 (isInt128InVR(Ty) &&
507 (Opcode == Instruction::And || ST->hasVectorEnhancements1()))))
508 return 0;
509 }
510 }
511 }
512
513 // Or requires one instruction, although it has custom handling for i64.
514 if (Opcode == Instruction::Or)
515 return 1;
516
517 if (Opcode == Instruction::Xor && ScalarBits == 1) {
518 if (ST->hasLoadStoreOnCond2())
519 return 5; // 2 * (li 0; loc 1); xor
520 return 7; // 2 * ipm sequences ; xor ; shift ; compare
521 }
522
523 if (DivRemConstPow2)
524 return (SignedDivRem ? SDivPow2Cost : 1);
525 if (DivRemConst)
526 return DivMulSeqCost;
527 if (SignedDivRem || UnsignedDivRem)
528 return DivInstrCost;
529 }
530 else if (ST->hasVector()) {
531 auto *VTy = cast<FixedVectorType>(Ty);
532 unsigned VF = VTy->getNumElements();
533 unsigned NumVectors = getNumVectorRegs(Ty);
534
535 // These vector operations are custom handled, but are still supported
536 // with one instruction per vector, regardless of element size.
537 if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
538 Opcode == Instruction::AShr) {
539 return NumVectors;
540 }
541
542 if (DivRemConstPow2)
543 return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
544 if (DivRemConst) {
545 SmallVector<Type *> Tys(Args.size(), Ty);
546 return VF * DivMulSeqCost +
547 getScalarizationOverhead(VTy, Args, Tys, CostKind);
548 }
549 if ((SignedDivRem || UnsignedDivRem) && VF > 4)
550 // Temporary hack: disable high vectorization factors with integer
551 // division/remainder, which will get scalarized and handled with
552 // GR128 registers. The mischeduler is not clever enough to avoid
553 // spilling yet.
554 return 1000;
555
556 // These FP operations are supported with a single vector instruction for
557 // double (base implementation assumes float generally costs 2). For
558 // FP128, the scalar cost is 1, and there is no overhead since the values
559 // are already in scalar registers.
560 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
561 Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
562 switch (ScalarBits) {
563 case 32: {
564 // The vector enhancements facility 1 provides v4f32 instructions.
565 if (ST->hasVectorEnhancements1())
566 return NumVectors;
567 // Return the cost of multiple scalar invocation plus the cost of
568 // inserting and extracting the values.
569 InstructionCost ScalarCost =
571 SmallVector<Type *> Tys(Args.size(), Ty);
573 (VF * ScalarCost) +
574 getScalarizationOverhead(VTy, Args, Tys, CostKind);
575 // FIXME: VF 2 for these FP operations are currently just as
576 // expensive as for VF 4.
577 if (VF == 2)
578 Cost *= 2;
579 return Cost;
580 }
581 case 64:
582 case 128:
583 return NumVectors;
584 default:
585 break;
586 }
587 }
588
589 // There is no native support for FRem.
590 if (Opcode == Instruction::FRem) {
591 SmallVector<Type *> Tys(Args.size(), Ty);
592 InstructionCost Cost = (VF * LIBCALL_COST) +
593 getScalarizationOverhead(VTy, Args, Tys, CostKind);
594 // FIXME: VF 2 for float is currently just as expensive as for VF 4.
595 if (VF == 2 && ScalarBits == 32)
596 Cost *= 2;
597 return Cost;
598 }
599 }
600
601 // Fallback to the default implementation.
602 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
603 Args, CxtI);
604}
605
609 ArrayRef<const Value *> Args, const Instruction *CxtI) {
610 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
611 if (ST->hasVector()) {
612 unsigned NumVectors = getNumVectorRegs(Tp);
613
614 // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
615
616 // FP128 values are always in scalar registers, so there is no work
617 // involved with a shuffle, except for broadcast. In that case register
618 // moves are done with a single instruction per element.
619 if (Tp->getScalarType()->isFP128Ty())
620 return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
621
622 switch (Kind) {
624 // ExtractSubvector Index indicates start offset.
625
626 // Extracting a subvector from first index is a noop.
627 return (Index == 0 ? 0 : NumVectors);
628
630 // Loop vectorizer calls here to figure out the extra cost of
631 // broadcasting a loaded value to all elements of a vector. Since vlrep
632 // loads and replicates with a single instruction, adjust the returned
633 // value.
634 return NumVectors - 1;
635
636 default:
637
638 // SystemZ supports single instruction permutation / replication.
639 return NumVectors;
640 }
641 }
642
643 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
644}
645
646// Return the log2 difference of the element sizes of the two vector types.
647static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
648 unsigned Bits0 = Ty0->getScalarSizeInBits();
649 unsigned Bits1 = Ty1->getScalarSizeInBits();
650
651 if (Bits1 > Bits0)
652 return (Log2_32(Bits1) - Log2_32(Bits0));
653
654 return (Log2_32(Bits0) - Log2_32(Bits1));
655}
656
657// Return the number of instructions needed to truncate SrcTy to DstTy.
659getVectorTruncCost(Type *SrcTy, Type *DstTy) {
660 assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
663 "Packing must reduce size of vector type.");
664 assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
665 cast<FixedVectorType>(DstTy)->getNumElements() &&
666 "Packing should not change number of elements.");
667
668 // TODO: Since fp32 is expanded, the extract cost should always be 0.
669
670 unsigned NumParts = getNumVectorRegs(SrcTy);
671 if (NumParts <= 2)
672 // Up to 2 vector registers can be truncated efficiently with pack or
673 // permute. The latter requires an immediate mask to be loaded, which
674 // typically gets hoisted out of a loop. TODO: return a good value for
675 // BB-VECTORIZER that includes the immediate loads, which we do not want
676 // to count for the loop vectorizer.
677 return 1;
678
679 unsigned Cost = 0;
680 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
681 unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
682 for (unsigned P = 0; P < Log2Diff; ++P) {
683 if (NumParts > 1)
684 NumParts /= 2;
685 Cost += NumParts;
686 }
687
688 // Currently, a general mix of permutes and pack instructions is output by
689 // isel, which follow the cost computation above except for this case which
690 // is one instruction less:
691 if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
692 DstTy->getScalarSizeInBits() == 8)
693 Cost--;
694
695 return Cost;
696}
697
698// Return the cost of converting a vector bitmask produced by a compare
699// (SrcTy), to the type of the select or extend instruction (DstTy).
702 assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
703 "Should only be called with vector types.");
704
705 unsigned PackCost = 0;
706 unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
707 unsigned DstScalarBits = DstTy->getScalarSizeInBits();
708 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
709 if (SrcScalarBits > DstScalarBits)
710 // The bitmask will be truncated.
711 PackCost = getVectorTruncCost(SrcTy, DstTy);
712 else if (SrcScalarBits < DstScalarBits) {
713 unsigned DstNumParts = getNumVectorRegs(DstTy);
714 // Each vector select needs its part of the bitmask unpacked.
715 PackCost = Log2Diff * DstNumParts;
716 // Extra cost for moving part of mask before unpacking.
717 PackCost += DstNumParts - 1;
718 }
719
720 return PackCost;
721}
722
723// Return the type of the compared operands. This is needed to compute the
724// cost for a Select / ZExt or SExt instruction.
725static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
726 Type *OpTy = nullptr;
727 if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
728 OpTy = CI->getOperand(0)->getType();
729 else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
730 if (LogicI->getNumOperands() == 2)
731 if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
732 if (isa<CmpInst>(LogicI->getOperand(1)))
733 OpTy = CI0->getOperand(0)->getType();
734
735 if (OpTy != nullptr) {
736 if (VF == 1) {
737 assert (!OpTy->isVectorTy() && "Expected scalar type");
738 return OpTy;
739 }
740 // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
741 // be either scalar or already vectorized with a same or lesser VF.
742 Type *ElTy = OpTy->getScalarType();
743 return FixedVectorType::get(ElTy, VF);
744 }
745
746 return nullptr;
747}
748
749// Get the cost of converting a boolean vector to a vector with same width
750// and element size as Dst, plus the cost of zero extending if needed.
752getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
753 const Instruction *I) {
754 auto *DstVTy = cast<FixedVectorType>(Dst);
755 unsigned VF = DstVTy->getNumElements();
756 unsigned Cost = 0;
757 // If we know the widths of the compared operands, get any cost of
758 // converting it to match Dst. Otherwise assume same widths.
759 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
760 if (CmpOpTy != nullptr)
761 Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
762 if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
763 // One 'vn' per dst vector with an immediate mask.
764 Cost += getNumVectorRegs(Dst);
765 return Cost;
766}
767
769 Type *Src,
772 const Instruction *I) {
773 // FIXME: Can the logic below also be used for these cost kinds?
775 auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
776 return BaseCost == 0 ? BaseCost : 1;
777 }
778
779 unsigned DstScalarBits = Dst->getScalarSizeInBits();
780 unsigned SrcScalarBits = Src->getScalarSizeInBits();
781
782 if (!Src->isVectorTy()) {
783 assert (!Dst->isVectorTy());
784
785 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
786 if (Src->isIntegerTy(128))
787 return LIBCALL_COST;
788 if (SrcScalarBits >= 32 ||
789 (I != nullptr && isa<LoadInst>(I->getOperand(0))))
790 return 1;
791 return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
792 }
793
794 if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
795 Dst->isIntegerTy(128))
796 return LIBCALL_COST;
797
798 if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
799 if (Src->isIntegerTy(1)) {
800 if (DstScalarBits == 128)
801 return 5 /*branch seq.*/;
802
803 if (ST->hasLoadStoreOnCond2())
804 return 2; // li 0; loc 1
805
806 // This should be extension of a compare i1 result, which is done with
807 // ipm and a varying sequence of instructions.
808 unsigned Cost = 0;
809 if (Opcode == Instruction::SExt)
810 Cost = (DstScalarBits < 64 ? 3 : 4);
811 if (Opcode == Instruction::ZExt)
812 Cost = 3;
813 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
814 if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
815 // If operands of an fp-type were compared, this costs +1.
816 Cost++;
817 return Cost;
818 }
819 else if (isInt128InVR(Dst)) {
820 // Extensions from GPR to i128 (in VR) typically costs two instructions,
821 // but a zero-extending load would be just one extra instruction.
822 if (Opcode == Instruction::ZExt && I != nullptr)
823 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
824 if (Ld->hasOneUse())
825 return 1;
826 return 2;
827 }
828 }
829
830 if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
831 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
832 if (Ld->hasOneUse())
833 return 0; // Will be converted to GPR load.
834 bool OnlyTruncatingStores = true;
835 for (const User *U : I->users())
836 if (!isa<StoreInst>(U)) {
837 OnlyTruncatingStores = false;
838 break;
839 }
840 if (OnlyTruncatingStores)
841 return 0;
842 return 2; // Vector element extraction.
843 }
844 }
845 else if (ST->hasVector()) {
846 // Vector to scalar cast.
847 auto *SrcVecTy = cast<FixedVectorType>(Src);
848 auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
849 if (!DstVecTy) {
850 // TODO: tune vector-to-scalar cast.
851 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
852 }
853 unsigned VF = SrcVecTy->getNumElements();
854 unsigned NumDstVectors = getNumVectorRegs(Dst);
855 unsigned NumSrcVectors = getNumVectorRegs(Src);
856
857 if (Opcode == Instruction::Trunc) {
858 if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
859 return 0; // Check for NOOP conversions.
860 return getVectorTruncCost(Src, Dst);
861 }
862
863 if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
864 if (SrcScalarBits >= 8) {
865 // ZExt will use either a single unpack or a vector permute.
866 if (Opcode == Instruction::ZExt)
867 return NumDstVectors;
868
869 // SExt will be handled with one unpack per doubling of width.
870 unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
871
872 // For types that span multiple vector registers, some additional
873 // instructions are used to setup the unpacking.
874 unsigned NumSrcVectorOps =
875 (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
876 : (NumDstVectors / 2));
877
878 return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
879 }
880 else if (SrcScalarBits == 1)
881 return getBoolVecToIntConversionCost(Opcode, Dst, I);
882 }
883
884 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
885 Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
886 // TODO: Fix base implementation which could simplify things a bit here
887 // (seems to miss on differentiating on scalar/vector types).
888
889 // Only 64 bit vector conversions are natively supported before z15.
890 if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
891 if (SrcScalarBits == DstScalarBits)
892 return NumDstVectors;
893
894 if (SrcScalarBits == 1)
895 return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
896 }
897
898 // Return the cost of multiple scalar invocation plus the cost of
899 // inserting and extracting the values. Base implementation does not
900 // realize float->int gets scalarized.
902 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
903 InstructionCost TotCost = VF * ScalarCost;
904 bool NeedsInserts = true, NeedsExtracts = true;
905 // FP128 registers do not get inserted or extracted.
906 if (DstScalarBits == 128 &&
907 (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
908 NeedsInserts = false;
909 if (SrcScalarBits == 128 &&
910 (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
911 NeedsExtracts = false;
912
913 TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
914 NeedsExtracts, CostKind);
915 TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
916 /*Extract*/ false, CostKind);
917
918 // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
919 if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
920 TotCost *= 2;
921
922 return TotCost;
923 }
924
925 if (Opcode == Instruction::FPTrunc) {
926 if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
927 return VF /*ldxbr/lexbr*/ +
928 getScalarizationOverhead(DstVecTy, /*Insert*/ true,
929 /*Extract*/ false, CostKind);
930 else // double -> float
931 return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
932 }
933
934 if (Opcode == Instruction::FPExt) {
935 if (SrcScalarBits == 32 && DstScalarBits == 64) {
936 // float -> double is very rare and currently unoptimized. Instead of
937 // using vldeb, which can do two at a time, all conversions are
938 // scalarized.
939 return VF * 2;
940 }
941 // -> fp128. VF * lxdb/lxeb + extraction of elements.
942 return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
943 /*Extract*/ true, CostKind);
944 }
945 }
946
947 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
948}
949
950// Scalar i8 / i16 operations will typically be made after first extending
951// the operands to i32.
952static unsigned getOperandsExtensionCost(const Instruction *I) {
953 unsigned ExtCost = 0;
954 for (Value *Op : I->operands())
955 // A load of i8 or i16 sign/zero extends to i32.
956 if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
957 ExtCost++;
958
959 return ExtCost;
960}
961
963 Type *CondTy,
964 CmpInst::Predicate VecPred,
966 const Instruction *I) {
968 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
969
970 if (!ValTy->isVectorTy()) {
971 switch (Opcode) {
972 case Instruction::ICmp: {
973 // A loaded value compared with 0 with multiple users becomes Load and
974 // Test. The load is then not foldable, so return 0 cost for the ICmp.
975 unsigned ScalarBits = ValTy->getScalarSizeInBits();
976 if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
977 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
978 if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
979 if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
980 C->isZero())
981 return 0;
982
983 unsigned Cost = 1;
984 if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
985 Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
986 return Cost;
987 }
988 case Instruction::Select:
989 if (ValTy->isFloatingPointTy() || isInt128InVR(ValTy))
990 return 4; // No LOC for FP / i128 - costs a conditional jump.
991 return 1; // Load On Condition / Select Register.
992 }
993 }
994 else if (ST->hasVector()) {
995 unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();
996
997 // Called with a compare instruction.
998 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
999 unsigned PredicateExtraCost = 0;
1000 if (I != nullptr) {
1001 // Some predicates cost one or two extra instructions.
1002 switch (cast<CmpInst>(I)->getPredicate()) {
1008 PredicateExtraCost = 1;
1009 break;
1014 PredicateExtraCost = 2;
1015 break;
1016 default:
1017 break;
1018 }
1019 }
1020
1021 // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
1022 // floats. FIXME: <2 x float> generates same code as <4 x float>.
1023 unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
1024 unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
1025
1026 unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
1027 return Cost;
1028 }
1029 else { // Called with a select instruction.
1030 assert (Opcode == Instruction::Select);
1031
1032 // We can figure out the extra cost of packing / unpacking if the
1033 // instruction was passed and the compare instruction is found.
1034 unsigned PackCost = 0;
1035 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
1036 if (CmpOpTy != nullptr)
1037 PackCost =
1038 getVectorBitmaskConversionCost(CmpOpTy, ValTy);
1039
1040 return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
1041 }
1042 }
1043
1044 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
1045}
1046
1049 unsigned Index, Value *Op0,
1050 Value *Op1) {
// Cost of inserting or extracting the element at Index of the vector type
// Val. Only InsertElement into 64-bit integer elements and ExtractElement
// are costed here; every other case is forwarded to
// BaseT::getVectorInstrCost at the end of the function.
1051 // vlvgp will insert two grs into a vector register, so only count half the
1052 // number of instructions.
1053 if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
// Even indices are charged 1 and odd indices 0, i.e. one instruction per
// pair of inserted elements.
1054 return ((Index % 2 == 0) ? 1 : 0);
1055
1056 if (Opcode == Instruction::ExtractElement) {
// A 1-bit element costs 2 (the extra unit is the test-under-mask); every
// other element size costs 1.
1057 int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
1058
1059 // Give a slight penalty for moving out of vector pipeline to FXU unit.
// The penalty is added only for integer elements extracted at Index 0.
1060 if (Index == 0 && Val->isIntOrIntVectorTy())
1061 Cost += 1;
1062
1063 return Cost;
1064 }
1065
// Everything else uses the generic estimate.
1066 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1067}
1067}
1068
1069// Check if a load may be folded as a memory operand in its user.
//
// Ld must have exactly one use. If that user is a single-use trunc / sext /
// zext, the instruction consuming the conversion is examined instead.
// FoldedValue receives the value that the examined user consumes: the load
// itself, or the conversion of the load. Note that FoldedValue is written on
// every path except the early multi-use return, including paths that end up
// returning false.
1071isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
// A load with more than one use is never considered foldable.
1072 if (!Ld->hasOneUse())
1073 return false;
1074 FoldedValue = Ld;
1075 const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
1076 unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
// At most one of the three widths below becomes non-zero: the scalar result
// width of a single-use trunc / sext / zext that consumes the load.
1077 unsigned TruncBits = 0;
1078 unsigned SExtBits = 0;
1079 unsigned ZExtBits = 0;
1080 if (UserI->hasOneUse()) {
1081 unsigned UserBits = UserI->getType()->getScalarSizeInBits();
1082 if (isa<TruncInst>(UserI))
1083 TruncBits = UserBits;
1084 else if (isa<SExtInst>(UserI))
1085 SExtBits = UserBits;
1086 else if (isa<ZExtInst>(UserI))
1087 ZExtBits = UserBits;
1088 }
// Look through the conversion: from here on UserI is the instruction that
// consumes the converted value and FoldedValue is the conversion itself.
1089 if (TruncBits || SExtBits || ZExtBits) {
1090 FoldedValue = UserI;
1091 UserI = cast<Instruction>(*UserI->user_begin());
1092 // Load (single use) -> trunc/extend (single use) -> UserI
1093 }
// For Sub / SDiv / UDiv the folded value must be operand 1 of the user.
1094 if ((UserI->getOpcode() == Instruction::Sub ||
1095 UserI->getOpcode() == Instruction::SDiv ||
1096 UserI->getOpcode() == Instruction::UDiv) &&
1097 UserI->getOperand(1) != FoldedValue)
1098 return false; // Not commutative, only RHS foldable.
1099 // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
1100 // extension was made of the load.
1101 unsigned LoadOrTruncBits =
1102 ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
// The cases below deliberately fall through, so an opcode is also checked
// against every rule that appears after its own label. Opcodes without a
// case label reach the final 'return false'.
1103 switch (UserI->getOpcode()) {
1104 case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
1105 case Instruction::Sub:
1106 case Instruction::ICmp:
// Zero extension of a 32-bit load to 64 bits.
1107 if (LoadedBits == 32 && ZExtBits == 64)
1108 return true;
1109 [[fallthrough]];
1110 case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
// The 16-bit rules in this group are skipped when the user is an ICmp.
1111 if (UserI->getOpcode() != Instruction::ICmp) {
1112 if (LoadedBits == 16 &&
1113 (SExtBits == 32 ||
1114 (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
1115 return true;
1116 if (LoadOrTruncBits == 16)
1117 return true;
1118 }
1119 [[fallthrough]];
1120 case Instruction::SDiv:// SE: 32->64
1121 if (LoadedBits == 32 && SExtBits == 64)
1122 return true;
1123 [[fallthrough]];
1124 case Instruction::UDiv:
1125 case Instruction::And:
1126 case Instruction::Or:
1127 case Instruction::Xor:
1128 // This also makes sense for float operations, but disabled for now due
1129 // to regressions.
1130 // case Instruction::FCmp:
1131 // case Instruction::FAdd:
1132 // case Instruction::FSub:
1133 // case Instruction::FMul:
1134 // case Instruction::FDiv:
1135
1136 // All possible extensions of memory checked above.
1137
1138 // Comparison between memory and immediate.
// Accepted when operand 1 of the compare is a ConstantInt whose value
// satisfies isIntN(16).
1139 if (UserI->getOpcode() == Instruction::ICmp)
1140 if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
1141 if (CI->getValue().isIntN(16))
1142 return true;
// Otherwise foldable only if 32 or 64 bits are effectively loaded, which
// excludes any extended load because LoadOrTruncBits is 0 for those.
1143 return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
1144 break;
1145 }
1146 return false;
1147}
1148
1149static bool isBswapIntrinsicCall(const Value *V) {
1150 if (const Instruction *I = dyn_cast<Instruction>(V))
1151 if (auto *CI = dyn_cast<CallInst>(I))
1152 if (auto *F = CI->getCalledFunction())
1153 if (F->getIntrinsicID() == Intrinsic::bswap)
1154 return true;
1155 return false;
1156}
1157
1159 MaybeAlign Alignment,
1160 unsigned AddressSpace,
1162 TTI::OperandValueInfo OpInfo,
1163 const Instruction *I) {
1164 assert(!Src->isVoidTy() && "Invalid type");
1165
1166 // TODO: Handle other cost kinds.
1168 return 1;
1169
1170 if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
1171 // Store the load or its truncated or extended value in FoldedValue.
1172 const Instruction *FoldedValue = nullptr;
1173 if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
1174 const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
1175 assert (UserI->getNumOperands() == 2 && "Expected a binop.");
1176
1177 // UserI can't fold two loads, so in that case return 0 cost only
1178 // half of the time.
1179 for (unsigned i = 0; i < 2; ++i) {
1180 if (UserI->getOperand(i) == FoldedValue)
1181 continue;
1182
1183 if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
1184 LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
1185 if (!OtherLoad &&
1186 (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
1187 isa<ZExtInst>(OtherOp)))
1188 OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
1189 if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
1190 return i == 0; // Both operands foldable.
1191 }
1192 }
1193
1194 return 0; // Only I is foldable in user.
1195 }
1196 }
1197
1198 // Type legalization (via getNumberOfParts) can't handle structs
1199 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1200 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1201 CostKind);
1202
1203 // FP128 is a legal type but kept in a register pair on older CPUs.
1204 if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
1205 return 2;
1206
1207 unsigned NumOps =
1208 (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
1209
1210 // Store/Load reversed saves one instruction.
1211 if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
1212 I != nullptr) {
1213 if (Opcode == Instruction::Load && I->hasOneUse()) {
1214 const Instruction *LdUser = cast<Instruction>(*I->user_begin());
1215 // In case of load -> bswap -> store, return normal cost for the load.
1216 if (isBswapIntrinsicCall(LdUser) &&
1217 (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
1218 return 0;
1219 }
1220 else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
1221 const Value *StoredVal = SI->getValueOperand();
1222 if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
1223 return 0;
1224 }
1225 }
1226
1227 return NumOps;
1228}
1229
1230// The generic implementation of getInterleavedMemoryOpCost() is based on
1231// adding costs of the memory operations plus all the extracts and inserts
1232// needed for using / defining the vector operands. The SystemZ version does
1233// roughly the same but bases the computations on vector permutations
1234// instead.
//
// The returned cost is the number of vector memory operations plus the
// estimated number of permutes. Requests that use a mask (UseMaskForCond or
// UseMaskForGaps) are forwarded unchanged to the generic implementation.
1236 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1237 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1238 bool UseMaskForCond, bool UseMaskForGaps) {
1239 if (UseMaskForCond || UseMaskForGaps)
1240 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1241 Alignment, AddressSpace, CostKind,
1242 UseMaskForCond, UseMaskForGaps);
1243 assert(isa<VectorType>(VecTy) &&
1244 "Expect a vector type for interleaved memory op");
1245
1246 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1247 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
// VF is the number of elements of each interleaved member; the wide vector
// VecTy holds Factor members of VF elements each.
1248 unsigned VF = NumElts / Factor;
// Number of elements that fit in one 128-bit vector register.
1249 unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
1250 unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
1251 unsigned NumPermutes = 0;
1252
1253 if (Opcode == Instruction::Load) {
1254 // Loading interleave groups may have gaps, which may mean fewer
1255 // loads. Find out how many vectors will be loaded in total, and in how
1256 // many of them each value will be in.
// UsedInsts has one bit per vector register of the wide vector;
// ValueVecs[Index] records which of those registers hold elements of the
// member with that index.
1257 BitVector UsedInsts(NumVectorMemOps, false);
1258 std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
1259 for (unsigned Index : Indices)
1260 for (unsigned Elt = 0; Elt < VF; ++Elt) {
// Element Elt of member Index sits at position Index + Elt * Factor of the
// wide vector; Vec is the register containing that position.
1261 unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
1262 UsedInsts.set(Vec);
1263 ValueVecs[Index].set(Vec);
1264 }
// Only registers that contain at least one requested element are loaded.
1265 NumVectorMemOps = UsedInsts.count();
1266
1267 for (unsigned Index : Indices) {
1268 // Estimate that each loaded source vector containing this Index
1269 // requires one operation, except that vperm can handle two input
1270 // registers first time for each dst vector.
1271 unsigned NumSrcVecs = ValueVecs[Index].count();
1272 unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
1273 assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
// At least one permute is counted per requested member.
1274 NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
1275 }
1276 } else {
1277 // Estimate the permutes for each stored vector as the smaller of the
1278 // number of elements and the number of source vectors. Subtract one per
1279 // dst vector for vperm (S.A.).
1280 unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
1281 unsigned NumDstVecs = NumVectorMemOps;
1282 NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
1283 }
1284
1285 // Cost of load/store operations and the permutations needed.
1286 return NumVectorMemOps + NumPermutes;
1287}
1288
// SystemZ-specific cost of a vector intrinsic, or -1 when this function has
// no estimate for it. Two cases are handled: bswap with a vector return type
// and vector_reduce_add.
1289static int
1291 const SmallVectorImpl<Type *> &ParamTys) {
// A vector bswap is costed as one operation per vector register of RetTy.
1292 if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
1293 return getNumVectorRegs(RetTy); // VPERM
1294
1295 if (ID == Intrinsic::vector_reduce_add) {
1296 // Retrieve number and size of elements for the vector op.
// The reduced vector is the first parameter type; it must be a fixed vector.
1297 auto *VTy = cast<FixedVectorType>(ParamTys.front());
1298 unsigned ScalarSize = VTy->getScalarSizeInBits();
1299 // For scalar sizes >128 bits, we fall back to the generic cost estimate.
1300 if (ScalarSize > SystemZ::VectorBits)
1301 return -1;
1302 // This many vector regs are needed to represent the input elements (V).
1303 unsigned VectorRegsNeeded = getNumVectorRegs(VTy);
1304 // This many instructions are needed for the final sum of vector elems (S).
// Elements narrower than 32 bits are charged 3 instructions, wider ones 2.
1305 unsigned LastVectorHandling = (ScalarSize < 32) ? 3 : 2;
1306 // We use vector adds to create a sum vector, which takes
1307 // V/2 + V/4 + ... = V - 1 operations.
1308 // Then, we need S operations to sum up the elements of that sum vector,
1309 // for a total of V + S - 1 operations.
1310 int Cost = VectorRegsNeeded + LastVectorHandling - 1;
1311 return Cost;
1312 }
// No SystemZ-specific estimate for this intrinsic.
1313 return -1;
1314}
1315
1320 ICA.getID(), ICA.getReturnType(), ICA.getArgTypes());
1321 if (Cost != -1)
1322 return Cost;
1324}
1325
// Decide whether the reduction intrinsic II should be expanded (true) or
// kept as an intrinsic (false). Only vector_reduce_add is ever kept, and
// only on subtargets with vector support.
1327 // Always expand on Subtargets without vector instructions.
1328 if (!ST->hasVector())
1329 return true;
1330
1331 // Whether or not to expand is a per-intrinsic decision.
1332 switch (II->getIntrinsicID()) {
// Every intrinsic without its own case below is expanded.
1333 default:
1334 return true;
1335 // Do not expand vector.reduce.add...
1336 case Intrinsic::vector_reduce_add:
// The reduced vector is operand 0 of the intrinsic call; it must be a
// fixed vector.
1337 auto *VType = cast<FixedVectorType>(II->getOperand(0)->getType());
1338 // ...unless the scalar size is i64 or larger,
1339 // or the operand vector is not full, since the
1340 // performance benefit is dubious in those cases.
// "Not full" means the whole operand is smaller than SystemZ::VectorBits.
1341 return VType->getScalarSizeInBits() >= 64 ||
1342 VType->getPrimitiveSizeInBits() < SystemZ::VectorBits;
1343 }
1344}
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
const HexagonInstrInfo * TII
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static const Function * getCalledFunction(const Value *V)
uint64_t IntrinsicInst * II
#define P(N)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getNumElements(Type *Ty)
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1)
static bool isBswapIntrinsicCall(const Value *V)
static unsigned getOperandsExtensionCost(const Instruction *I)
static Type * getCmpOpsType(const Instruction *I, unsigned VF=1)
static unsigned getScalarSizeInBits(Type *Ty)
static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, const SmallVectorImpl< Type * > &ParamTys)
static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse)
static unsigned getNumVectorRegs(Type *Ty)
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:429
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:420
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:975
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:768
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:660
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:897
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
BitVector & set()
Definition: BitVector.h:351
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1236
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:747
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:787
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:781
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:765
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:768
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:766
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:785
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:783
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:767
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:146
This is an important base class in LLVM.
Definition: Constant.h:42
This class represents an Operation in the Expression.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:680
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
An instruction for reading from memory.
Definition: Instructions.h:174
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
This class wraps the llvm.memcpy intrinsic.
The optimization diagnostic interface.
The main scalar evolution driver.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:587
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
An instruction for storing to memory.
Definition: Instructions.h:290
const SystemZInstrInfo * getInstrInfo() const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
bool shouldExpandReduction(const IntrinsicInst *II) const
unsigned getNumberOfRegisters(unsigned ClassID) const
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, const Instruction *I)
unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const override
unsigned adjustInliningThreshold(const CallBase *CB) const
unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool hasDivRemOp(Type *DataType, bool IsSigned)
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isLoweredToCall(const Function *F) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:261
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:230
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:162
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:224
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:343
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
const unsigned VectorBits
Definition: SystemZ.h:154
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
AddressSpace
Definition: NVPTXBaseInfo.h:21
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
InstructionCost Cost
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:34
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
unsigned Insns
TODO: Some of these could be merged.
Parameters that control the generic loop unrolling transformation.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned FullUnrollMaxCount
Set the maximum unrolling factor for full unrolling.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...