1//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a TargetTransformInfo analysis pass specific to the
10// SystemZ target machine. It uses the target's detailed information to provide
11// more precise answers to certain TTI queries, while letting the target
12// independent and default TTI implementations handle the rest.
13//
14//===----------------------------------------------------------------------===//
15
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/Support/Debug.h"
26
27using namespace llvm;
28
29#define DEBUG_TYPE "systemztti"
30
31//===----------------------------------------------------------------------===//
32//
33// SystemZ cost model.
34//
35//===----------------------------------------------------------------------===//
36
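// Return true if V, looked through bitcasts and GEPs, is used as the source
// operand of a non-volatile memcpy. OtherUse is set if any other kind of use
// is seen.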
37static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
38 bool UsedAsMemCpySource = false;
39 for (const User *U : V->users())
40 if (const Instruction *User = dyn_cast<Instruction>(U)) {
41 if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
42 UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
43 continue;
44 }
45 if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
46 if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
47 UsedAsMemCpySource = true;
48 continue;
49 }
50 }
51 OtherUse = true;
52 }
53 return UsedAsMemCpySource;
54}
55
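// Count the non-volatile loads from and stores to Ptr (including accesses
// made through GEPs based on Ptr) that appear inside function F.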
56static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores,
57 unsigned &NumLoads, const Function *F) {
58 if (!isa<PointerType>(Ptr->getType()))
59 return;
60 for (const User *U : Ptr->users())
61 if (const Instruction *User = dyn_cast<Instruction>(U)) {
62 if (User->getParent()->getParent() == F) {
63 if (const auto *SI = dyn_cast<StoreInst>(User)) {
64 if (SI->getPointerOperand() == Ptr && !SI->isVolatile())
65 NumStores++;
66 } else if (const auto *LI = dyn_cast<LoadInst>(User)) {
67 if (LI->getPointerOperand() == Ptr && !LI->isVolatile())
68 NumLoads++;
69 } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(User)) {
70 if (GEP->getPointerOperand() == Ptr)
71 countNumMemAccesses(GEP, NumStores, NumLoads, F);
72 }
73 }
74 }
75}
76
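// Give an inlining bonus (capped at 1000) when inlining looks especially
// profitable on SystemZ: a callee argument that is only used as a memcpy
// source, a global that both caller and callee access heavily, or heavy
// callee accesses to a caller alloca that is passed as an argument.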
77unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
78 unsigned Bonus = 0;
79 const Function *Caller = CB->getParent()->getParent();
80 const Function *Callee = CB->getCalledFunction();
81 if (!Callee)
82 return 0;
83 const Module *M = Caller->getParent();
84
85 // Increase the threshold if an incoming argument is used only as a memcpy
86 // source.
87 for (const Argument &Arg : Callee->args()) {
88 bool OtherUse = false;
89 if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse) {
90 Bonus = 1000;
91 break;
92 }
93 }
94
95 // Give a bonus for globals that are used heavily in both caller and callee.
96 std::set<const GlobalVariable *> CalleeGlobals;
97 std::set<const GlobalVariable *> CallerGlobals;
98 for (const GlobalVariable &Global : M->globals())
99 for (const User *U : Global.users())
100 if (const Instruction *User = dyn_cast<Instruction>(U)) {
101 if (User->getParent()->getParent() == Callee)
102 CalleeGlobals.insert(&Global);
103 if (User->getParent()->getParent() == Caller)
104 CallerGlobals.insert(&Global);
105 }
106 for (auto *GV : CalleeGlobals)
107 if (CallerGlobals.count(GV)) {
108 unsigned CalleeStores = 0, CalleeLoads = 0;
109 unsigned CallerStores = 0, CallerLoads = 0;
110 countNumMemAccesses(GV, CalleeStores, CalleeLoads, Callee);
111 countNumMemAccesses(GV, CallerStores, CallerLoads, Caller);
112 if ((CalleeStores + CalleeLoads) > 10 &&
113 (CallerStores + CallerLoads) > 10) {
114 Bonus = 1000;
115 break;
116 }
117 }
118
119 // Give bonus when Callee accesses an Alloca of Caller heavily.
120 unsigned NumStores = 0;
121 unsigned NumLoads = 0;
122 for (unsigned OpIdx = 0; OpIdx != Callee->arg_size(); ++OpIdx) {
123 Value *CallerArg = CB->getArgOperand(OpIdx);
124 Argument *CalleeArg = Callee->getArg(OpIdx);
125 if (isa<AllocaInst>(CallerArg))
126 countNumMemAccesses(CalleeArg, NumStores, NumLoads, Callee);
127 }
128 if (NumLoads > 10)
129 Bonus += NumLoads * 50;
130 if (NumStores > 10)
131 Bonus += NumStores * 50;
132 Bonus = std::min(Bonus, unsigned(1000));
133
134 LLVM_DEBUG(if (Bonus)
135 dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
136 return Bonus;
137}
138
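// Cost of materializing the immediate Imm of integer type Ty, in multiples of
// TTI::TCC_Basic (roughly one instruction per unit).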
139InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
140 TTI::TargetCostKind CostKind) {
141 assert(Ty->isIntegerTy());
142
143 unsigned BitSize = Ty->getPrimitiveSizeInBits();
144 // There is no cost model for constants with a bit size of 0. Return TCC_Free
145 // here, so that constant hoisting will ignore this constant.
146 if (BitSize == 0)
147 return TTI::TCC_Free;
148 // No cost model for operations on integers larger than 128 bit implemented yet.
149 if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
150 return TTI::TCC_Free;
151
152 if (Imm == 0)
153 return TTI::TCC_Free;
154
155 if (Imm.getBitWidth() <= 64) {
156 // Constants loaded via lgfi.
157 if (isInt<32>(Imm.getSExtValue()))
158 return TTI::TCC_Basic;
159 // Constants loaded via llilf.
160 if (isUInt<32>(Imm.getZExtValue()))
161 return TTI::TCC_Basic;
162 // Constants loaded via llihf:
163 if ((Imm.getZExtValue() & 0xffffffff) == 0)
164 return TTI::TCC_Basic;
165
166 return 2 * TTI::TCC_Basic;
167 }
168
169 // i128 immediates loads from Constant Pool
170 return 2 * TTI::TCC_Basic;
171}
172
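// Cost of the immediate Imm appearing as operand Idx of an instruction with
// the given Opcode. Immediates that can be encoded directly into the
// instruction are reported as TCC_Free so that constant hoisting leaves them
// in place.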
173InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
174 const APInt &Imm, Type *Ty,
175 TTI::TargetCostKind CostKind,
176 Instruction *Inst) {
177 assert(Ty->isIntegerTy());
178
179 unsigned BitSize = Ty->getPrimitiveSizeInBits();
180 // There is no cost model for constants with a bit size of 0. Return TCC_Free
181 // here, so that constant hoisting will ignore this constant.
182 if (BitSize == 0)
183 return TTI::TCC_Free;
184 // No cost model implemented yet for operations on integers wider than 64 bits.
185 if (BitSize > 64)
186 return TTI::TCC_Free;
187
188 switch (Opcode) {
189 default:
190 return TTI::TCC_Free;
191 case Instruction::GetElementPtr:
192 // Always hoist the base address of a GetElementPtr. This prevents the
193 // creation of new constants for every base constant that gets constant
194 // folded with the offset.
195 if (Idx == 0)
196 return 2 * TTI::TCC_Basic;
197 return TTI::TCC_Free;
198 case Instruction::Store:
199 if (Idx == 0 && Imm.getBitWidth() <= 64) {
200 // Any 8-bit immediate store can be implemented via mvi.
201 if (BitSize == 8)
202 return TTI::TCC_Free;
203 // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
204 if (isInt<16>(Imm.getSExtValue()))
205 return TTI::TCC_Free;
206 }
207 break;
208 case Instruction::ICmp:
209 if (Idx == 1 && Imm.getBitWidth() <= 64) {
210 // Comparisons against signed 32-bit immediates implemented via cgfi.
211 if (isInt<32>(Imm.getSExtValue()))
212 return TTI::TCC_Free;
213 // Comparisons against unsigned 32-bit immediates implemented via clgfi.
214 if (isUInt<32>(Imm.getZExtValue()))
215 return TTI::TCC_Free;
216 }
217 break;
218 case Instruction::Add:
219 case Instruction::Sub:
220 if (Idx == 1 && Imm.getBitWidth() <= 64) {
221 // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
222 if (isUInt<32>(Imm.getZExtValue()))
223 return TTI::TCC_Free;
224 // Or their negation, by swapping addition vs. subtraction.
225 if (isUInt<32>(-Imm.getSExtValue()))
226 return TTI::TCC_Free;
227 }
228 break;
229 case Instruction::Mul:
230 if (Idx == 1 && Imm.getBitWidth() <= 64) {
231 // We use msgfi to multiply by 32-bit signed immediates.
232 if (isInt<32>(Imm.getSExtValue()))
233 return TTI::TCC_Free;
234 }
235 break;
236 case Instruction::Or:
237 case Instruction::Xor:
238 if (Idx == 1 && Imm.getBitWidth() <= 64) {
239 // Masks supported by oilf/xilf.
240 if (isUInt<32>(Imm.getZExtValue()))
241 return TTI::TCC_Free;
242 // Masks supported by oihf/xihf.
243 if ((Imm.getZExtValue() & 0xffffffff) == 0)
244 return TTI::TCC_Free;
245 }
246 break;
247 case Instruction::And:
248 if (Idx == 1 && Imm.getBitWidth() <= 64) {
249 // Any 32-bit AND operation can be implemented via nilf.
250 if (BitSize <= 32)
251 return TTI::TCC_Free;
252 // 64-bit masks supported by nilf.
253 if (isUInt<32>(~Imm.getZExtValue()))
254 return TTI::TCC_Free;
255 // 64-bit masks supported by nilh.
256 if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
257 return TTI::TCC_Free;
258 // Some 64-bit AND operations can be implemented via risbg.
259 const SystemZInstrInfo *TII = ST->getInstrInfo();
260 unsigned Start, End;
261 if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
262 return TTI::TCC_Free;
263 }
264 break;
265 case Instruction::Shl:
266 case Instruction::LShr:
267 case Instruction::AShr:
268 // Always return TCC_Free for the shift value of a shift instruction.
269 if (Idx == 1)
270 return TTI::TCC_Free;
271 break;
272 case Instruction::UDiv:
273 case Instruction::SDiv:
274 case Instruction::URem:
275 case Instruction::SRem:
276 case Instruction::Trunc:
277 case Instruction::ZExt:
278 case Instruction::SExt:
279 case Instruction::IntToPtr:
280 case Instruction::PtrToInt:
281 case Instruction::BitCast:
282 case Instruction::PHI:
283 case Instruction::Call:
284 case Instruction::Select:
285 case Instruction::Ret:
286 case Instruction::Load:
287 break;
288 }
289
290 return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
291}
292
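// Like getIntImmCostInst, but for an immediate used as operand Idx of an
// intrinsic call.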
293InstructionCost
294SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
295 const APInt &Imm, Type *Ty,
296 TTI::TargetCostKind CostKind) {
297 assert(Ty->isIntegerTy());
298
299 unsigned BitSize = Ty->getPrimitiveSizeInBits();
300 // There is no cost model for constants with a bit size of 0. Return TCC_Free
301 // here, so that constant hoisting will ignore this constant.
302 if (BitSize == 0)
303 return TTI::TCC_Free;
304 // No cost model implemented yet for operations on integers wider than 64 bits.
305 if (BitSize > 64)
306 return TTI::TCC_Free;
307
308 switch (IID) {
309 default:
310 return TTI::TCC_Free;
311 case Intrinsic::sadd_with_overflow:
312 case Intrinsic::uadd_with_overflow:
313 case Intrinsic::ssub_with_overflow:
314 case Intrinsic::usub_with_overflow:
315 // These get expanded to include a normal addition/subtraction.
316 if (Idx == 1 && Imm.getBitWidth() <= 64) {
317 if (isUInt<32>(Imm.getZExtValue()))
318 return TTI::TCC_Free;
319 if (isUInt<32>(-Imm.getSExtValue()))
320 return TTI::TCC_Free;
321 }
322 break;
323 case Intrinsic::smul_with_overflow:
324 case Intrinsic::umul_with_overflow:
325 // These get expanded to include a normal multiplication.
326 if (Idx == 1 && Imm.getBitWidth() <= 64) {
327 if (isInt<32>(Imm.getSExtValue()))
328 return TTI::TCC_Free;
329 }
330 break;
331 case Intrinsic::experimental_stackmap:
332 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
333 return TTI::TCC_Free;
334 break;
335 case Intrinsic::experimental_patchpoint_void:
336 case Intrinsic::experimental_patchpoint:
337 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
338 return TTI::TCC_Free;
339 break;
340 }
341 return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
342}
343
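// Report fast hardware popcount when the subtarget has the population-count
// facility and the type is at most 64 bits wide.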
344TargetTransformInfo::PopcntSupportKind
345SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
346 assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
347 if (ST->hasPopulationCount() && TyWidth <= 64)
348 return TTI::PSK_FastHardware;
349 return TTI::PSK_Software;
350}
351
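// Limit unrolling so that the unrolled loop does not contain too many stores
// (see the z13 note below), and be conservative in loops that contain calls.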
352void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
353 TTI::UnrollingPreferences &UP,
354 OptimizationRemarkEmitter *ORE) {
355 // Find out if L contains a call, what the machine instruction count
356 // estimate is, and how many stores there are.
357 bool HasCall = false;
358 InstructionCost NumStores = 0;
359 for (auto &BB : L->blocks())
360 for (auto &I : *BB) {
361 if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
362 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
363 if (isLoweredToCall(F))
364 HasCall = true;
365 if (F->getIntrinsicID() == Intrinsic::memcpy ||
366 F->getIntrinsicID() == Intrinsic::memset)
367 NumStores++;
368 } else { // indirect call.
369 HasCall = true;
370 }
371 }
372 if (isa<StoreInst>(&I)) {
373 Type *MemAccessTy = I.getOperand(0)->getType();
374 NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy,
375 std::nullopt, 0, TTI::TCK_RecipThroughput);
376 }
377 }
378
379 // The z13 processor will run out of store tags if too many stores
380 // are fed into it too quickly. Therefore make sure there are not
381 // too many stores in the resulting unrolled loop.
382 unsigned const NumStoresVal = *NumStores.getValue();
383 unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);
384
385 if (HasCall) {
386 // Only allow full unrolling if loop has any calls.
387 UP.FullUnrollMaxCount = Max;
388 UP.MaxCount = 1;
389 return;
390 }
391
392 UP.MaxCount = Max;
393 if (UP.MaxCount <= 1)
394 return;
395
396 // Allow partial and runtime trip count unrolling.
397 UP.Partial = UP.Runtime = true;
398
399 UP.PartialThreshold = 75;
400 UP.DefaultUnrollRuntimeCount = 4;
401
402 // Allow expensive instructions in the pre-header of the loop.
403 UP.AllowExpensiveTripCount = true;
404
405 UP.Force = true;
406}
407
408void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
409 TTI::PeelingPreferences &PP) {
410 BaseT::getPeelingPreferences(L, SE, PP);
411}
412
413bool SystemZTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
414 const TargetTransformInfo::LSRCost &C2) {
415 // SystemZ specific: check instruction count (first), and don't care about
416 // ImmCost, since offsets are checked explicitly.
417 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
418 C1.NumIVMuls, C1.NumBaseAdds,
419 C1.ScaleCost, C1.SetupCost) <
420 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
421 C2.NumIVMuls, C2.NumBaseAdds,
422 C2.ScaleCost, C2.SetupCost);
423}
424
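// Number of allocatable registers per register class: 14 usable GPRs (the
// stack pointer is reserved and %r0 cannot be used in an address), and 32
// vector registers when the vector facility is available.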
425unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
426 bool Vector = (ClassID == 1);
427 if (!Vector)
428 // Discount the stack pointer. Also leave out %r0, since it can't
429 // be used in an address.
430 return 14;
431 if (ST->hasVector())
432 return 32;
433 return 0;
434}
435
436TypeSize
437SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
438 switch (K) {
439 case TargetTransformInfo::RGK_Scalar:
440 return TypeSize::getFixed(64);
441 case TargetTransformInfo::RGK_FixedWidthVector:
442 return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
443 case TargetTransformInfo::RGK_ScalableVector:
444 return TypeSize::getScalable(0);
445 }
446
447 llvm_unreachable("Unsupported register kind");
448}
449
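// Minimum stride, in bytes, for which software prefetch instructions should
// be emitted.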
450unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
451 unsigned NumStridedMemAccesses,
452 unsigned NumPrefetches,
453 bool HasCall) const {
454 // Don't prefetch a loop with many far apart accesses.
455 if (NumPrefetches > 16)
456 return UINT_MAX;
457
458 // Emit prefetch instructions for smaller strides in cases where we think
459 // the hardware prefetcher might not be able to keep up.
460 if (NumStridedMemAccesses > 32 && !HasCall &&
461 (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
462 return 1;
463
464 return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
465}
466
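// SystemZ integer division produces the quotient and remainder together, so
// divrem is available for any legal scalar integer type regardless of
// signedness.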
467bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
468 EVT VT = TLI->getValueType(DL, DataType);
469 return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
470}
471
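// A scalar load with a single, non-store user can be element-loaded directly
// into a vector register (VLE), so it is free as an insertion operand. Loads
// that feed a store are left alone so that MVC can be used instead.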
472static bool isFreeEltLoad(Value *Op) {
473 if (isa<LoadInst>(Op) && Op->hasOneUse()) {
474 const Instruction *UserI = cast<Instruction>(*Op->user_begin());
475 return !isa<StoreInst>(UserI); // Prefer MVC
476 }
477 return false;
478}
479
480InstructionCost SystemZTTIImpl::getScalarizationOverhead(
481 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
482 TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
483 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
484 InstructionCost Cost = 0;
485
486 if (Insert && Ty->isIntOrIntVectorTy(64)) {
487 // VLVGP will insert two GPRs with one instruction, while VLE will load
488 // an element directly with no extra cost
489 assert((VL.empty() || VL.size() == NumElts) &&
490 "Type does not match the number of values.");
491 InstructionCost CurrVectorCost = 0;
492 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
493 if (DemandedElts[Idx] && !(VL.size() && isFreeEltLoad(VL[Idx])))
494 ++CurrVectorCost;
495 if (Idx % 2 == 1) {
496 Cost += std::min(InstructionCost(1), CurrVectorCost);
497 CurrVectorCost = 0;
498 }
499 }
500 Insert = false;
501 }
502
503 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
504 CostKind, VL);
505 return Cost;
506}
507
508// Return the bit size for the scalar type or vector element
509// type. getScalarSizeInBits() returns 0 for a pointer type.
510static unsigned getScalarSizeInBits(Type *Ty) {
511 unsigned Size =
512 (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
513 assert(Size > 0 && "Element must have non-zero size.");
514 return Size;
515}
516
517// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
518// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
519// 3.
520static unsigned getNumVectorRegs(Type *Ty) {
521 auto *VTy = cast<FixedVectorType>(Ty);
522 unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
523 assert(WideBits > 0 && "Could not compute size of vector");
524 return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
525}
526
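// Cost of integer and floating-point arithmetic. Division/remainder by a
// constant, logical operations that can be combined into a single
// instruction, and scalarized vector FP cases get special treatment below.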
527InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
528 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
529 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
530 ArrayRef<const Value *> Args,
531 const Instruction *CxtI) {
532
533 // TODO: Handle more cost kinds.
534 if (CostKind != TTI::TCK_RecipThroughput)
535 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
536 Op2Info, Args, CxtI);
537
538 // TODO: return a good value for BB-VECTORIZER that includes the
539 // immediate loads, which we do not want to count for the loop
540 // vectorizer, since they are hopefully hoisted out of the loop. This
541 // would require a new parameter 'InLoop', but not sure if constant
542 // args are common enough to motivate this.
543
544 unsigned ScalarBits = Ty->getScalarSizeInBits();
545
546 // There are three cases of division and remainder: Dividing with a register
547 // needs a divide instruction. A divisor which is a power of two constant
548 // can be implemented with a sequence of shifts. Any other constant needs a
549 // multiply and shifts.
550 const unsigned DivInstrCost = 20;
551 const unsigned DivMulSeqCost = 10;
552 const unsigned SDivPow2Cost = 4;
553
554 bool SignedDivRem =
555 Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
556 bool UnsignedDivRem =
557 Opcode == Instruction::UDiv || Opcode == Instruction::URem;
558
559 // Check for a constant divisor.
560 bool DivRemConst = false;
561 bool DivRemConstPow2 = false;
562 if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
563 if (const Constant *C = dyn_cast<Constant>(Args[1])) {
564 const ConstantInt *CVal =
565 (C->getType()->isVectorTy()
566 ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
567 : dyn_cast<const ConstantInt>(C));
568 if (CVal && (CVal->getValue().isPowerOf2() ||
569 CVal->getValue().isNegatedPowerOf2()))
570 DivRemConstPow2 = true;
571 else
572 DivRemConst = true;
573 }
574 }
575
576 if (!Ty->isVectorTy()) {
577 // These FP operations are supported with a dedicated instruction for
578 // float, double and fp128 (base implementation assumes float generally
579 // costs 2).
580 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
581 Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
582 return 1;
583
584 // There is no native support for FRem.
585 if (Opcode == Instruction::FRem)
586 return LIBCALL_COST;
587
588 // Give discount for some combined logical operations if supported.
589 if (Args.size() == 2) {
590 if (Opcode == Instruction::Xor) {
591 for (const Value *A : Args) {
592 if (const Instruction *I = dyn_cast<Instruction>(A))
593 if (I->hasOneUse() &&
594 (I->getOpcode() == Instruction::Or ||
595 I->getOpcode() == Instruction::And ||
596 I->getOpcode() == Instruction::Xor))
597 if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
598 (isInt128InVR(Ty) &&
599 (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1())))
600 return 0;
601 }
602 }
603 else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
604 for (const Value *A : Args) {
605 if (const Instruction *I = dyn_cast<Instruction>(A))
606 if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
607 ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
608 (isInt128InVR(Ty) &&
609 (Opcode == Instruction::And || ST->hasVectorEnhancements1()))))
610 return 0;
611 }
612 }
613 }
614
615 // Or requires one instruction, although it has custom handling for i64.
616 if (Opcode == Instruction::Or)
617 return 1;
618
619 if (Opcode == Instruction::Xor && ScalarBits == 1) {
620 if (ST->hasLoadStoreOnCond2())
621 return 5; // 2 * (li 0; loc 1); xor
622 return 7; // 2 * ipm sequences ; xor ; shift ; compare
623 }
624
625 if (DivRemConstPow2)
626 return (SignedDivRem ? SDivPow2Cost : 1);
627 if (DivRemConst)
628 return DivMulSeqCost;
629 if (SignedDivRem || UnsignedDivRem)
630 return DivInstrCost;
631 }
632 else if (ST->hasVector()) {
633 auto *VTy = cast<FixedVectorType>(Ty);
634 unsigned VF = VTy->getNumElements();
635 unsigned NumVectors = getNumVectorRegs(Ty);
636
637 // These vector operations are custom handled, but are still supported
638 // with one instruction per vector, regardless of element size.
639 if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
640 Opcode == Instruction::AShr) {
641 return NumVectors;
642 }
643
644 if (DivRemConstPow2)
645 return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
646 if (DivRemConst) {
647 SmallVector<Type *> Tys(Args.size(), Ty);
648 return VF * DivMulSeqCost +
649 BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
650 }
651 if ((SignedDivRem || UnsignedDivRem) && VF > 4)
652 // Temporary hack: disable high vectorization factors with integer
653 // division/remainder, which will get scalarized and handled with
654 // GR128 registers. The mischeduler is not clever enough to avoid
655 // spilling yet.
656 return 1000;
657
658 // These FP operations are supported with a single vector instruction for
659 // double (base implementation assumes float generally costs 2). For
660 // FP128, the scalar cost is 1, and there is no overhead since the values
661 // are already in scalar registers.
662 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
663 Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
664 switch (ScalarBits) {
665 case 32: {
666 // The vector enhancements facility 1 provides v4f32 instructions.
667 if (ST->hasVectorEnhancements1())
668 return NumVectors;
669 // Return the cost of multiple scalar invocation plus the cost of
670 // inserting and extracting the values.
671 InstructionCost ScalarCost =
672 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
673 SmallVector<Type *> Tys(Args.size(), Ty);
674 InstructionCost Cost =
675 (VF * ScalarCost) +
676 BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
677 // FIXME: VF 2 for these FP operations are currently just as
678 // expensive as for VF 4.
679 if (VF == 2)
680 Cost *= 2;
681 return Cost;
682 }
683 case 64:
684 case 128:
685 return NumVectors;
686 default:
687 break;
688 }
689 }
690
691 // There is no native support for FRem.
692 if (Opcode == Instruction::FRem) {
693 SmallVector<Type *> Tys(Args.size(), Ty);
694 InstructionCost Cost =
695 (VF * LIBCALL_COST) +
696 BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind);
697 // FIXME: VF 2 for float is currently just as expensive as for VF 4.
698 if (VF == 2 && ScalarBits == 32)
699 Cost *= 2;
700 return Cost;
701 }
702 }
703
704 // Fallback to the default implementation.
705 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
706 Args, CxtI);
707}
708
709InstructionCost SystemZTTIImpl::getShuffleCost(
710 TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
711 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
712 ArrayRef<const Value *> Args, const Instruction *CxtI) {
713 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
714 if (ST->hasVector()) {
715 unsigned NumVectors = getNumVectorRegs(Tp);
716
717 // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
718
719 // FP128 values are always in scalar registers, so there is no work
720 // involved with a shuffle, except for broadcast. In that case register
721 // moves are done with a single instruction per element.
722 if (Tp->getScalarType()->isFP128Ty())
723 return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
724
725 switch (Kind) {
726 case TargetTransformInfo::SK_ExtractSubvector:
727 // ExtractSubvector Index indicates start offset.
728
729 // Extracting a subvector from first index is a noop.
730 return (Index == 0 ? 0 : NumVectors);
731
732 case TargetTransformInfo::SK_Broadcast:
733 // Loop vectorizer calls here to figure out the extra cost of
734 // broadcasting a loaded value to all elements of a vector. Since vlrep
735 // loads and replicates with a single instruction, adjust the returned
736 // value.
737 return NumVectors - 1;
738
739 default:
740
741 // SystemZ supports single instruction permutation / replication.
742 return NumVectors;
743 }
744 }
745
746 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
747}
748
749// Return the log2 difference of the element sizes of the two vector types.
750static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
751 unsigned Bits0 = Ty0->getScalarSizeInBits();
752 unsigned Bits1 = Ty1->getScalarSizeInBits();
753
754 if (Bits1 > Bits0)
755 return (Log2_32(Bits1) - Log2_32(Bits0));
756
757 return (Log2_32(Bits0) - Log2_32(Bits1));
758}
759
760// Return the number of instructions needed to truncate SrcTy to DstTy.
761unsigned SystemZTTIImpl::
762getVectorTruncCost(Type *SrcTy, Type *DstTy) {
763 assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
764 assert(SrcTy->getPrimitiveSizeInBits().getFixedValue() >
765 DstTy->getPrimitiveSizeInBits().getFixedValue() &&
766 "Packing must reduce size of vector type.");
767 assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
768 cast<FixedVectorType>(DstTy)->getNumElements() &&
769 "Packing should not change number of elements.");
770
771 // TODO: Since fp32 is expanded, the extract cost should always be 0.
772
773 unsigned NumParts = getNumVectorRegs(SrcTy);
774 if (NumParts <= 2)
775 // Up to 2 vector registers can be truncated efficiently with pack or
776 // permute. The latter requires an immediate mask to be loaded, which
777 // typically gets hoisted out of a loop. TODO: return a good value for
778 // BB-VECTORIZER that includes the immediate loads, which we do not want
779 // to count for the loop vectorizer.
780 return 1;
781
782 unsigned Cost = 0;
783 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
784 unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
785 for (unsigned P = 0; P < Log2Diff; ++P) {
786 if (NumParts > 1)
787 NumParts /= 2;
788 Cost += NumParts;
789 }
790
791 // Currently, a general mix of permutes and pack instructions is output by
792 // isel, which follow the cost computation above except for this case which
793 // is one instruction less:
794 if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
795 DstTy->getScalarSizeInBits() == 8)
796 Cost--;
797
798 return Cost;
799}
800
801// Return the cost of converting a vector bitmask produced by a compare
802// (SrcTy), to the type of the select or extend instruction (DstTy).
803unsigned SystemZTTIImpl::
804getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
805 assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
806 "Should only be called with vector types.");
807
808 unsigned PackCost = 0;
809 unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
810 unsigned DstScalarBits = DstTy->getScalarSizeInBits();
811 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
812 if (SrcScalarBits > DstScalarBits)
813 // The bitmask will be truncated.
814 PackCost = getVectorTruncCost(SrcTy, DstTy);
815 else if (SrcScalarBits < DstScalarBits) {
816 unsigned DstNumParts = getNumVectorRegs(DstTy);
817 // Each vector select needs its part of the bitmask unpacked.
818 PackCost = Log2Diff * DstNumParts;
819 // Extra cost for moving part of mask before unpacking.
820 PackCost += DstNumParts - 1;
821 }
822
823 return PackCost;
824}
825
826// Return the type of the compared operands. This is needed to compute the
827// cost for a Select / ZExt or SExt instruction.
828static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
829 Type *OpTy = nullptr;
830 if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
831 OpTy = CI->getOperand(0)->getType();
832 else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
833 if (LogicI->getNumOperands() == 2)
834 if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
835 if (isa<CmpInst>(LogicI->getOperand(1)))
836 OpTy = CI0->getOperand(0)->getType();
837
838 if (OpTy != nullptr) {
839 if (VF == 1) {
840 assert (!OpTy->isVectorTy() && "Expected scalar type");
841 return OpTy;
842 }
843 // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
844 // be either scalar or already vectorized with a same or lesser VF.
845 Type *ElTy = OpTy->getScalarType();
846 return FixedVectorType::get(ElTy, VF);
847 }
848
849 return nullptr;
850}
851
852// Get the cost of converting a boolean vector to a vector with same width
853// and element size as Dst, plus the cost of zero extending if needed.
854unsigned SystemZTTIImpl::
855getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
856 const Instruction *I) {
857 auto *DstVTy = cast<FixedVectorType>(Dst);
858 unsigned VF = DstVTy->getNumElements();
859 unsigned Cost = 0;
860 // If we know the widths of the compared operands, get any cost of
861 // converting it to match Dst. Otherwise assume same widths.
862 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
863 if (CmpOpTy != nullptr)
864 Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
865 if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
866 // One 'vn' per dst vector with an immediate mask.
867 Cost += getNumVectorRegs(Dst);
868 return Cost;
869}
870
871InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
872 Type *Src,
873 TTI::CastContextHint CCH,
874 TTI::TargetCostKind CostKind,
875 const Instruction *I) {
876 // FIXME: Can the logic below also be used for these cost kinds?
877 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
878 auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
879 return BaseCost == 0 ? BaseCost : 1;
880 }
881
882 unsigned DstScalarBits = Dst->getScalarSizeInBits();
883 unsigned SrcScalarBits = Src->getScalarSizeInBits();
884
885 if (!Src->isVectorTy()) {
886 assert (!Dst->isVectorTy());
887
888 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
889 if (Src->isIntegerTy(128))
890 return LIBCALL_COST;
891 if (SrcScalarBits >= 32 ||
892 (I != nullptr && isa<LoadInst>(I->getOperand(0))))
893 return 1;
894 return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
895 }
896
897 if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
898 Dst->isIntegerTy(128))
899 return LIBCALL_COST;
900
901 if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
902 if (Src->isIntegerTy(1)) {
903 if (DstScalarBits == 128)
904 return 5 /*branch seq.*/;
905
906 if (ST->hasLoadStoreOnCond2())
907 return 2; // li 0; loc 1
908
909 // This should be extension of a compare i1 result, which is done with
910 // ipm and a varying sequence of instructions.
911 unsigned Cost = 0;
912 if (Opcode == Instruction::SExt)
913 Cost = (DstScalarBits < 64 ? 3 : 4);
914 if (Opcode == Instruction::ZExt)
915 Cost = 3;
916 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
917 if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
918 // If operands of an fp-type was compared, this costs +1.
919 Cost++;
920 return Cost;
921 }
922 else if (isInt128InVR(Dst)) {
923 // Extensions from GPR to i128 (in VR) typically costs two instructions,
924 // but a zero-extending load would be just one extra instruction.
925 if (Opcode == Instruction::ZExt && I != nullptr)
926 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
927 if (Ld->hasOneUse())
928 return 1;
929 return 2;
930 }
931 }
932
933 if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
934 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
935 if (Ld->hasOneUse())
936 return 0; // Will be converted to GPR load.
937 bool OnlyTruncatingStores = true;
938 for (const User *U : I->users())
939 if (!isa<StoreInst>(U)) {
940 OnlyTruncatingStores = false;
941 break;
942 }
943 if (OnlyTruncatingStores)
944 return 0;
945 return 2; // Vector element extraction.
946 }
947 }
948 else if (ST->hasVector()) {
949 // Vector to scalar cast.
950 auto *SrcVecTy = cast<FixedVectorType>(Src);
951 auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
952 if (!DstVecTy) {
953 // TODO: tune vector-to-scalar cast.
954 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
955 }
956 unsigned VF = SrcVecTy->getNumElements();
957 unsigned NumDstVectors = getNumVectorRegs(Dst);
958 unsigned NumSrcVectors = getNumVectorRegs(Src);
959
960 if (Opcode == Instruction::Trunc) {
961 if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
962 return 0; // Check for NOOP conversions.
963 return getVectorTruncCost(Src, Dst);
964 }
965
966 if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
967 if (SrcScalarBits >= 8) {
968 // ZExt will use either a single unpack or a vector permute.
969 if (Opcode == Instruction::ZExt)
970 return NumDstVectors;
971
972 // SExt will be handled with one unpack per doubling of width.
973 unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
974
975 // For types that spans multiple vector registers, some additional
976 // instructions are used to setup the unpacking.
977 unsigned NumSrcVectorOps =
978 (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
979 : (NumDstVectors / 2));
980
981 return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
982 }
983 else if (SrcScalarBits == 1)
984 return getBoolVecToIntConversionCost(Opcode, Dst, I);
985 }
986
987 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
988 Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
989 // TODO: Fix base implementation which could simplify things a bit here
990 // (seems to miss on differentiating on scalar/vector types).
991
992 // Only 64 bit vector conversions are natively supported before z15.
993 if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
994 if (SrcScalarBits == DstScalarBits)
995 return NumDstVectors;
996
997 if (SrcScalarBits == 1)
998 return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
999 }
1000
1001 // Return the cost of multiple scalar invocation plus the cost of
1002 // inserting and extracting the values. Base implementation does not
1003 // realize float->int gets scalarized.
1004 InstructionCost ScalarCost = getCastInstrCost(
1005 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
1006 InstructionCost TotCost = VF * ScalarCost;
1007 bool NeedsInserts = true, NeedsExtracts = true;
1008 // FP128 registers do not get inserted or extracted.
1009 if (DstScalarBits == 128 &&
1010 (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
1011 NeedsInserts = false;
1012 if (SrcScalarBits == 128 &&
1013 (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
1014 NeedsExtracts = false;
1015
1016 TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
1017 NeedsExtracts, CostKind);
1018 TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
1019 /*Extract*/ false, CostKind);
1020
1021 // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
1022 if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
1023 TotCost *= 2;
1024
1025 return TotCost;
1026 }
1027
1028 if (Opcode == Instruction::FPTrunc) {
1029 if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
1030 return VF /*ldxbr/lexbr*/ +
1031 BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
1032 /*Extract*/ false, CostKind);
1033 else // double -> float
1034 return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
1035 }
1036
1037 if (Opcode == Instruction::FPExt) {
1038 if (SrcScalarBits == 32 && DstScalarBits == 64) {
1039 // float -> double is very rare and currently unoptimized. Instead of
1040 // using vldeb, which can do two at a time, all conversions are
1041 // scalarized.
1042 return VF * 2;
1043 }
1044 // -> fp128. VF * lxdb/lxeb + extraction of elements.
1045 return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
1046 /*Extract*/ true, CostKind);
1047 }
1048 }
1049
1050 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1051}
1052
1053// Scalar i8 / i16 operations will typically be made after first extending
1054// the operands to i32.
1055static unsigned getOperandsExtensionCost(const Instruction *I) {
1056 unsigned ExtCost = 0;
1057 for (Value *Op : I->operands())
1058 // A load of i8 or i16 sign/zero extends to i32.
1059 if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
1060 ExtCost++;
1061
1062 return ExtCost;
1063}
1064
1065InstructionCost SystemZTTIImpl::getCmpSelInstrCost(
1066 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
1067 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
1068 TTI::OperandValueInfo Op2Info, const Instruction *I) {
1069 if (CostKind != TTI::TCK_RecipThroughput)
1070 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1071 Op1Info, Op2Info);
1072
1073 if (!ValTy->isVectorTy()) {
1074 switch (Opcode) {
1075 case Instruction::ICmp: {
1076 // A loaded value compared with 0 with multiple users becomes Load and
1077 // Test. The load is then not foldable, so return 0 cost for the ICmp.
1078 unsigned ScalarBits = ValTy->getScalarSizeInBits();
1079 if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
1080 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
1081 if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
1082 if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
1083 C->isZero())
1084 return 0;
1085
1086 unsigned Cost = 1;
1087 if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
1088 Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
1089 return Cost;
1090 }
1091 case Instruction::Select:
1092 if (ValTy->isFloatingPointTy() || isInt128InVR(ValTy))
1093 return 4; // No LOC for FP / i128 - costs a conditional jump.
1094 return 1; // Load On Condition / Select Register.
1095 }
1096 }
1097 else if (ST->hasVector()) {
1098 unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();
1099
1100 // Called with a compare instruction.
1101 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
1102 unsigned PredicateExtraCost = 0;
1103 if (I != nullptr) {
1104 // Some predicates cost one or two extra instructions.
1105 switch (cast<CmpInst>(I)->getPredicate()) {
1106 case CmpInst::Predicate::ICMP_NE:
1107 case CmpInst::Predicate::ICMP_UGE:
1108 case CmpInst::Predicate::ICMP_ULE:
1109 case CmpInst::Predicate::ICMP_SGE:
1110 case CmpInst::Predicate::ICMP_SLE:
1111 PredicateExtraCost = 1;
1112 break;
1113 case CmpInst::Predicate::FCMP_ONE:
1114 case CmpInst::Predicate::FCMP_ORD:
1115 case CmpInst::Predicate::FCMP_UEQ:
1116 case CmpInst::Predicate::FCMP_UNO:
1117 PredicateExtraCost = 2;
1118 break;
1119 default:
1120 break;
1121 }
1122 }
1123
1124 // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
1125 // floats. FIXME: <2 x float> generates same code as <4 x float>.
1126 unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
1127 unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
1128
1129 unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
1130 return Cost;
1131 }
1132 else { // Called with a select instruction.
1133 assert (Opcode == Instruction::Select);
1134
1135 // We can figure out the extra cost of packing / unpacking if the
1136 // instruction was passed and the compare instruction is found.
1137 unsigned PackCost = 0;
1138 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
1139 if (CmpOpTy != nullptr)
1140 PackCost =
1141 getVectorBitmaskConversionCost(CmpOpTy, ValTy);
1142
1143 return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
1144 }
1145 }
1146
1147 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1148 Op1Info, Op2Info);
1149}
1150
1151InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1152 TTI::TargetCostKind CostKind,
1153 unsigned Index, Value *Op0,
1154 Value *Op1) {
1155 if (Opcode == Instruction::InsertElement) {
1156 // Vector Element Load.
1157 if (Op1 != nullptr && isFreeEltLoad(Op1))
1158 return 0;
1159
1160 // vlvgp will insert two grs into a vector register, so count half the
1161 // number of instructions as an estimate when we don't have the full
1162 // picture (as in getScalarizationOverhead()).
1163 if (Val->isIntOrIntVectorTy(64))
1164 return ((Index % 2 == 0) ? 1 : 0);
1165 }
1166
1167 if (Opcode == Instruction::ExtractElement) {
1168 int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
1169
1170 // Give a slight penalty for moving out of vector pipeline to FXU unit.
1171 if (Index == 0 && Val->isIntOrIntVectorTy())
1172 Cost += 1;
1173
1174 return Cost;
1175 }
1176
1177 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1178}
1179
1180// Check if a load may be folded as a memory operand in its user.
1181bool SystemZTTIImpl::
1182isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
1183 if (!Ld->hasOneUse())
1184 return false;
1185 FoldedValue = Ld;
1186 const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
1187 unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
1188 unsigned TruncBits = 0;
1189 unsigned SExtBits = 0;
1190 unsigned ZExtBits = 0;
1191 if (UserI->hasOneUse()) {
1192 unsigned UserBits = UserI->getType()->getScalarSizeInBits();
1193 if (isa<TruncInst>(UserI))
1194 TruncBits = UserBits;
1195 else if (isa<SExtInst>(UserI))
1196 SExtBits = UserBits;
1197 else if (isa<ZExtInst>(UserI))
1198 ZExtBits = UserBits;
1199 }
1200 if (TruncBits || SExtBits || ZExtBits) {
1201 FoldedValue = UserI;
1202 UserI = cast<Instruction>(*UserI->user_begin());
1203 // Load (single use) -> trunc/extend (single use) -> UserI
1204 }
1205 if ((UserI->getOpcode() == Instruction::Sub ||
1206 UserI->getOpcode() == Instruction::SDiv ||
1207 UserI->getOpcode() == Instruction::UDiv) &&
1208 UserI->getOperand(1) != FoldedValue)
1209 return false; // Not commutative, only RHS foldable.
1210 // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
1211 // extension was made of the load.
1212 unsigned LoadOrTruncBits =
1213 ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
1214 switch (UserI->getOpcode()) {
1215 case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
1216 case Instruction::Sub:
1217 case Instruction::ICmp:
1218 if (LoadedBits == 32 && ZExtBits == 64)
1219 return true;
1220 [[fallthrough]];
1221 case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
1222 if (UserI->getOpcode() != Instruction::ICmp) {
1223 if (LoadedBits == 16 &&
1224 (SExtBits == 32 ||
1225 (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
1226 return true;
1227 if (LoadOrTruncBits == 16)
1228 return true;
1229 }
1230 [[fallthrough]];
1231 case Instruction::SDiv:// SE: 32->64
1232 if (LoadedBits == 32 && SExtBits == 64)
1233 return true;
1234 [[fallthrough]];
1235 case Instruction::UDiv:
1236 case Instruction::And:
1237 case Instruction::Or:
1238 case Instruction::Xor:
1239 // This also makes sense for float operations, but disabled for now due
1240 // to regressions.
1241 // case Instruction::FCmp:
1242 // case Instruction::FAdd:
1243 // case Instruction::FSub:
1244 // case Instruction::FMul:
1245 // case Instruction::FDiv:
1246
1247 // All possible extensions of memory checked above.
1248
1249 // Comparison between memory and immediate.
1250 if (UserI->getOpcode() == Instruction::ICmp)
1251 if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
1252 if (CI->getValue().isIntN(16))
1253 return true;
1254 return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
1255 break;
1256 }
1257 return false;
1258}
1259
1260static bool isBswapIntrinsicCall(const Value *V) {
1261 if (const Instruction *I = dyn_cast<Instruction>(V))
1262 if (auto *CI = dyn_cast<CallInst>(I))
1263 if (auto *F = CI->getCalledFunction())
1264 if (F->getIntrinsicID() == Intrinsic::bswap)
1265 return true;
1266 return false;
1267}
1268
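// Cost of a load or store. Accounts for loads that fold into their user,
// FP128 register pairs on older CPUs, and byte-swapping (reversed) loads and
// stores.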
1269InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1270 MaybeAlign Alignment,
1271 unsigned AddressSpace,
1272 TTI::TargetCostKind CostKind,
1273 TTI::OperandValueInfo OpInfo,
1274 const Instruction *I) {
1275 assert(!Src->isVoidTy() && "Invalid type");
1276
1277 // TODO: Handle other cost kinds.
1278 if (CostKind != TTI::TCK_RecipThroughput)
1279 return 1;
1280
1281 if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
1282 // Store the load or its truncated or extended value in FoldedValue.
1283 const Instruction *FoldedValue = nullptr;
1284 if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
1285 const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
1286 assert (UserI->getNumOperands() == 2 && "Expected a binop.");
1287
1288 // UserI can't fold two loads, so in that case return 0 cost only
1289 // half of the time.
1290 for (unsigned i = 0; i < 2; ++i) {
1291 if (UserI->getOperand(i) == FoldedValue)
1292 continue;
1293
1294 if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
1295 LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
1296 if (!OtherLoad &&
1297 (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
1298 isa<ZExtInst>(OtherOp)))
1299 OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
1300 if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
1301 return i == 0; // Both operands foldable.
1302 }
1303 }
1304
1305 return 0; // Only I is foldable in user.
1306 }
1307 }
1308
1309 // Type legalization (via getNumberOfParts) can't handle structs
1310 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1311 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1312 CostKind);
1313
1314 // FP128 is a legal type but kept in a register pair on older CPUs.
1315 if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
1316 return 2;
1317
1318 unsigned NumOps =
1319 (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
1320
1321 // Store/Load reversed saves one instruction.
1322 if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
1323 I != nullptr) {
1324 if (Opcode == Instruction::Load && I->hasOneUse()) {
1325 const Instruction *LdUser = cast<Instruction>(*I->user_begin());
1326 // In case of load -> bswap -> store, return normal cost for the load.
1327 if (isBswapIntrinsicCall(LdUser) &&
1328 (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
1329 return 0;
1330 }
1331 else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
1332 const Value *StoredVal = SI->getValueOperand();
1333 if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
1334 return 0;
1335 }
1336 }
1337
1338 return NumOps;
1339}
1340
1341// The generic implementation of getInterleavedMemoryOpCost() is based on
1342// adding costs of the memory operations plus all the extracts and inserts
1343// needed for using / defining the vector operands. The SystemZ version does
1344// roughly the same but bases the computations on vector permutations
1345// instead.
1346InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
1347 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1348 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1349 bool UseMaskForCond, bool UseMaskForGaps) {
1350 if (UseMaskForCond || UseMaskForGaps)
1351 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1352 Alignment, AddressSpace, CostKind,
1353 UseMaskForCond, UseMaskForGaps);
1354 assert(isa<VectorType>(VecTy) &&
1355 "Expect a vector type for interleaved memory op");
1356
1357 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1358 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1359 unsigned VF = NumElts / Factor;
1360 unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
1361 unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
1362 unsigned NumPermutes = 0;
1363
1364 if (Opcode == Instruction::Load) {
1365 // Loading interleave groups may have gaps, which may mean fewer
1366 // loads. Find out how many vectors will be loaded in total, and in how
1367 // many of them each value will be in.
1368 BitVector UsedInsts(NumVectorMemOps, false);
1369 std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
1370 for (unsigned Index : Indices)
1371 for (unsigned Elt = 0; Elt < VF; ++Elt) {
1372 unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
1373 UsedInsts.set(Vec);
1374 ValueVecs[Index].set(Vec);
1375 }
1376 NumVectorMemOps = UsedInsts.count();
1377
1378 for (unsigned Index : Indices) {
1379 // Estimate that each loaded source vector containing this Index
1380 // requires one operation, except that vperm can handle two input
1381 // registers first time for each dst vector.
1382 unsigned NumSrcVecs = ValueVecs[Index].count();
1383 unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
1384 assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
1385 NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
1386 }
1387 } else {
1388 // Estimate the permutes for each stored vector as the smaller of the
1389 // number of elements and the number of source vectors. Subtract one per
1390 // dst vector for vperm (S.A.).
1391 unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
1392 unsigned NumDstVecs = NumVectorMemOps;
1393 NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
1394 }
1395
1396 // Cost of load/store operations and the permutations needed.
1397 return NumVectorMemOps + NumPermutes;
1398}
1399
1400InstructionCost getIntAddReductionCost(unsigned NumVec, unsigned ScalarBits) {
1401 InstructionCost Cost = 0;
1402 // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
1403 Cost += NumVec - 1;
1404 // For integer adds, VSUM creates shorter reductions on the final vector.
1405 Cost += (ScalarBits < 32) ? 3 : 2;
1406 return Cost;
1407}
1408
1409InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems,
1410 unsigned ScalarBits) {
1411 unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
1412 InstructionCost Cost = 0;
1413 // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
1414 Cost += NumVec - 1;
1415 // For each shuffle / arithmetic layer, we need 2 instructions, and we need
1416 // log2(Elements in Last Vector) layers.
1417 Cost += 2 * Log2_32_Ceil(std::min(NumElems, NumEltsPerVecReg));
1418 return Cost;
1419}
1420
1421inline bool customCostReductions(unsigned Opcode) {
1422 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
1423 Opcode == Instruction::Add || Opcode == Instruction::Mul;
1424}
1425
1426InstructionCost
1427SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1428 std::optional<FastMathFlags> FMF,
1429 TTI::TargetCostKind CostKind) {
1430 unsigned ScalarBits = Ty->getScalarSizeInBits();
1431 // The following is only for subtargets with vector math, non-ordered
1432 // reductions, and reasonable scalar sizes for int and fp add/mul.
1433 if (customCostReductions(Opcode) && ST->hasVector() &&
1434 !TTI::requiresOrderedReduction(FMF) &&
1435 ScalarBits <= SystemZ::VectorBits) {
1436 unsigned NumVectors = getNumVectorRegs(Ty);
1437 unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements();
1438 // Integer Add is using custom code gen, that needs to be accounted for.
1439 if (Opcode == Instruction::Add)
1440 return getIntAddReductionCost(NumVectors, ScalarBits);
1441 // The base cost is the same across all other arithmetic instructions
1442 InstructionCost Cost =
1443 getFastReductionCost(NumVectors, NumElems, ScalarBits);
1444 // But we need to account for the final op involving the scalar operand.
1445 if ((Opcode == Instruction::FAdd) || (Opcode == Instruction::FMul))
1446 Cost += 1;
1447 return Cost;
1448 }
1449 // otherwise, fall back to the standard implementation
1450 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1451}
1452
1453InstructionCost
1454SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1455 FastMathFlags FMF,
1456 TTI::TargetCostKind CostKind) {
1457 // Return custom costs only on subtargets with vector enhancements.
1458 if (ST->hasVectorEnhancements1()) {
1459 unsigned NumVectors = getNumVectorRegs(Ty);
1460 unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements();
1461 unsigned ScalarBits = Ty->getScalarSizeInBits();
1462 InstructionCost Cost = 0;
1463 // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
1464 Cost += NumVectors - 1;
1465 // For the final vector, we need shuffle + min/max operations, and
1466 // we need #Elements - 1 of them.
1467 Cost += 2 * (std::min(NumElems, SystemZ::VectorBits / ScalarBits) - 1);
1468 return Cost;
1469 }
1470 // For other targets, fall back to the standard implementation
1471 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1472}
1473
1474static int
1475getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1476 const SmallVectorImpl<Type *> &ParamTys) {
1477 if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
1478 return getNumVectorRegs(RetTy); // VPERM
1479
1480 return -1;
1481}
1482
1483InstructionCost
1484SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1485 TTI::TargetCostKind CostKind) {
1486 InstructionCost Cost = getVectorIntrinsicInstrCost(
1487 ICA.getID(), ICA.getReturnType(), ICA.getArgTypes());
1488 if (Cost != -1)
1489 return Cost;
1490 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1491}
1492
1493bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
1494 // Always expand on Subtargets without vector instructions.
1495 if (!ST->hasVector())
1496 return true;
1497
1498 // Whether or not to expand is a per-intrinsic decision.
1499 switch (II->getIntrinsicID()) {
1500 default:
1501 return true;
1502 // Do not expand vector.reduce.add...
1503 case Intrinsic::vector_reduce_add:
1504 auto *VType = cast<FixedVectorType>(II->getOperand(0)->getType());
1505 // ...unless the scalar size is i64 or larger,
1506 // or the operand vector is not full, since the
1507 // performance benefit is dubious in those cases.
1508 return VType->getScalarSizeInBits() >= 64 ||
1509 VType->getPrimitiveSizeInBits() < SystemZ::VectorBits;
1510 }
1511}
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Hexagon Common GEP
const HexagonInstrInfo * TII
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static const Function * getCalledFunction(const Value *V)
uint64_t IntrinsicInst * II
#define P(N)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getNumElements(Type *Ty)
bool customCostReductions(unsigned Opcode)
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1)
static bool isBswapIntrinsicCall(const Value *V)
InstructionCost getIntAddReductionCost(unsigned NumVec, unsigned ScalarBits)
static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores, unsigned &NumLoads, const Function *F)
static unsigned getOperandsExtensionCost(const Instruction *I)
static Type * getCmpOpsType(const Instruction *I, unsigned VF=1)
static unsigned getScalarSizeInBits(Type *Ty)
static bool isFreeEltLoad(Value *Op)
InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems, unsigned ScalarBits)
static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, const SmallVectorImpl< Type * > &ParamTys)
static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse)
static unsigned getNumVectorRegs(Type *Ty)
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:449
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:668
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:780
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:932
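The BasicTTIImplBase hooks listed above are what IR-level passes ultimately reach when they ask for costs through the TargetTransformInfo analysis. A hedged sketch of such a query; the types and cost kind are arbitrary, and queryExampleCosts is not a real API:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Illustration: price a 32-bit integer add and a float->double extension in
// terms of reciprocal throughput.
static void queryExampleCosts(const TargetTransformInfo &TTI, LLVMContext &Ctx) {
  InstructionCost AddCost = TTI.getArithmeticInstrCost(
      Instruction::Add, Type::getInt32Ty(Ctx),
      TargetTransformInfo::TCK_RecipThroughput);
  InstructionCost ExtCost = TTI.getCastInstrCost(
      Instruction::FPExt, Type::getDoubleTy(Ctx), Type::getFloatTy(Ctx),
      TargetTransformInfo::CastContextHint::None,
      TargetTransformInfo::TCK_RecipThroughput);
  (void)AddCost;
  (void)ExtCost;
}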
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
BitVector & set()
Definition: BitVector.h:351
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1120
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1349
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1294
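CallBase is how a call site is inspected, e.g. to find the callee and walk its actual arguments. A small, hedged illustration (countPointerArgs is an invented helper):

#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Illustration: count pointer-typed arguments of a direct call; an indirect
// call has no known callee to reason about.
static unsigned countPointerArgs(const CallBase &CB) {
  if (!CB.getCalledFunction())
    return 0;
  unsigned N = 0;
  for (unsigned I = 0, E = CB.arg_size(); I != E; ++I)
    if (CB.getArgOperand(I)->getType()->isPointerTy())
      ++N;
  return N;
}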
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:661
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:681
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:684
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:682
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
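The predicate enumerators above matter for compare/select costing because a few floating-point predicates have no single native compare on many targets. A hedged classification sketch (isAwkwardFPPredicate is illustrative, not the file's logic):

#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Illustration: FCMP_ONE and FCMP_UEQ commonly require two compares (or a
// compare plus extra NaN handling), so a cost model may charge them more.
static bool isAwkwardFPPredicate(CmpInst::Predicate Pred) {
  return Pred == CmpInst::FCMP_ONE || Pred == CmpInst::FCMP_UEQ;
}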
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:148
This is an important base class in LLVM.
Definition: Constant.h:42
This class represents an Operation in the Expression.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
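FixedVectorType::get is how the cost code materializes the vector types it prices. A tiny example, assuming the 128-bit register width used elsewhere on this page:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

// Illustration: <4 x i32> fills exactly one 128-bit vector register.
static FixedVectorType *makeV4I32(LLVMContext &Ctx) {
  return FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
}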
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
An instruction for reading from memory.
Definition: Instructions.h:176
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
This class wraps the llvm.memcpy intrinsic.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
The main scalar evolution driver.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
const SystemZInstrInfo * getInstrInfo() const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
bool shouldExpandReduction(const IntrinsicInst *II) const
unsigned getNumberOfRegisters(unsigned ClassID) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, const Instruction *I)
unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const override
unsigned adjustInliningThreshold(const CallBase *CB) const
unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool hasDivRemOp(Type *DataType, bool IsSigned)
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
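getValueType plus isTypeLegal is the usual way to ask whether a type will be handled natively or has to be legalized first. A hedged sketch (isNativelySupported is an invented name):

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
using namespace llvm;

// Illustration: a type that maps to a simple, legal EVT can usually be costed
// as one native operation; anything else gets split or promoted first.
static bool isNativelySupported(const TargetLoweringBase &TLI,
                                const DataLayout &DL, Type *Ty) {
  EVT VT = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
  return VT.isSimple() && TLI.isTypeLegal(VT);
}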
bool isLoweredToCall(const Function *F) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
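requiresOrderedReduction is the gate between the cheap, reassociating (shuffle-based) reduction scheme and the strictly in-order one; the FastMathFlags parameters on the reduction-cost hooks feed into it. A thin, hedged wrapper to show the shape of the check:

#include "llvm/Analysis/TargetTransformInfo.h"
#include <optional>
using namespace llvm;

// Illustration: FP reductions without 'reassoc' must be evaluated in source
// order, which usually prices them far above the log2-depth scheme.
static bool mustBeOrdered(std::optional<FastMathFlags> FMF) {
  return TargetTransformInfo::requiresOrderedReduction(FMF);
}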
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
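TypeSize::getFixed and TypeSize::getScalable are how register-width queries such as getRegisterBitWidth report their answer; a fixed-width vector target reports a fixed size. An illustrative return value, assuming the 128-bit width used elsewhere on this page:

#include "llvm/Support/TypeSize.h"
using namespace llvm;

// Illustration: report a fixed 128-bit vector register; scalable-vector
// targets would use TypeSize::getScalable instead.
static TypeSize exampleVectorRegisterWidth() {
  return TypeSize::getFixed(128);
}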
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:162
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
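The Type predicates and accessors above are the basic vocabulary of every cost decision: strip the type down to its element and look at the element's width. A small, hedged helper (elementBits is invented for illustration):

#include "llvm/IR/Type.h"
using namespace llvm;

// Illustration: return the element width in bits for scalars and vectors
// alike, or 0 for non-primitive element types.
static unsigned elementBits(Type *Ty) {
  Type *EltTy = Ty->getScalarType(); // 'Ty' itself when not a vector.
  return (EltTy->isIntegerTy() || EltTy->isFloatingPointTy())
             ? EltTy->getPrimitiveSizeInBits().getFixedValue()
             : 0;
}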
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
const ParentTy * getParent() const
Definition: ilist_node.h:32
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
const unsigned VectorBits
Definition: SystemZ.h:154
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:353
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
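These MathExtras helpers recur throughout cost modelling: Log2_32_Ceil turns an element count into the depth of a pairwise reduction tree, and divideCeil rounds a bit count up to whole registers. Two hedged one-liners (names invented):

#include "llvm/Support/MathExtras.h"
using namespace llvm;

// Illustration: a pairwise reduction of NumElts values needs ceil(log2) steps.
static unsigned reductionSteps(unsigned NumElts) {
  return Log2_32_Ceil(NumElts);
}

// Illustration: number of 128-bit registers needed for a value of 'Bits' bits.
static unsigned regsFor(unsigned Bits) {
  return divideCeil(Bits, 128u);
}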
InstructionCost Cost
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:35
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
unsigned Insns
TODO: Some of these could be merged.
Parameters that control the generic loop unrolling transformation.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned FullUnrollMaxCount
Set the maximum unrolling factor for full unrolling.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...
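The UnrollingPreferences fields above are what a target's getUnrollingPreferences hook fills in to steer the generic unroller. A hedged, illustrative configuration; the numbers are invented and are not SystemZ's actual tuning:

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Illustration only: allow partial and runtime unrolling under a modest
// code-growth budget. Real targets derive these values from loop analysis.
static void exampleUnrollPrefs(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;                // Permit partial unrolling.
  UP.Runtime = true;                // Permit runtime trip-count unrolling.
  UP.PartialThreshold = 75;         // Cap code growth for partial unrolling.
  UP.DefaultUnrollRuntimeCount = 4; // Factor to use when trip count is unknown.
  UP.AllowExpensiveTripCount = false;
  UP.Force = false;
  UP.FullUnrollMaxCount = 16;       // Bound the factor for full unrolling.
}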