LLVM 23.0.0git
SystemZTargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a TargetTransformInfo analysis pass specific to the
10// SystemZ target machine. It uses the target's detailed information to provide
11// more precise answers to certain TTI queries, while letting the target
12// independent and default TTI implementations handle the rest.
13//
14//===----------------------------------------------------------------------===//
15
23#include "llvm/IR/Intrinsics.h"
24#include "llvm/Support/Debug.h"
27
28using namespace llvm;
29
30#define DEBUG_TYPE "systemztti"
31
32//===----------------------------------------------------------------------===//
33//
34// SystemZ cost model.
35//
36//===----------------------------------------------------------------------===//
37
// Walk the users of V and report whether any of them is a non-volatile memcpy
// that takes V as its operand 1 (the copy source, going by this function's
// name). OtherUse is an in/out flag: it is set to true as soon as a user is
// seen that falls through both `continue` paths below; it is never cleared
// here, so callers are expected to initialise it.
// NOTE(review): listing line 42 is absent from this extract. It presumably
// opens the conditional block that the recursive call (line 43) and the
// closing brace on line 45 belong to -- restore it from the upstream file
// before building; as extracted, the braces below do not balance.
38static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
39 bool UsedAsMemCpySource = false;
40 for (const User *U : V->users())
41 if (const Instruction *User = dyn_cast<Instruction>(U)) {
// Look through this user: recurse on it, sharing the same OtherUse flag, and
// fold its answer into ours.
43 UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
44 continue;
45 }
46 if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
// Only a non-volatile memcpy whose operand 1 is V itself counts.
47 if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
48 UsedAsMemCpySource = true;
49 continue;
50 }
51 }
// Any user that reaches this point is recorded as some other kind of use.
52 OtherUse = true;
53 }
54 return UsedAsMemCpySource;
55}
56
57static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores,
58 unsigned &NumLoads, const Function *F) {
59 if (!isa<PointerType>(Ptr->getType()))
60 return;
61 for (const User *U : Ptr->users())
62 if (const Instruction *User = dyn_cast<Instruction>(U)) {
63 if (User->getParent()->getParent() == F) {
64 if (const auto *SI = dyn_cast<StoreInst>(User)) {
65 if (SI->getPointerOperand() == Ptr && !SI->isVolatile())
66 NumStores++;
67 } else if (const auto *LI = dyn_cast<LoadInst>(User)) {
68 if (LI->getPointerOperand() == Ptr && !LI->isVolatile())
69 NumLoads++;
70 } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(User)) {
71 if (GEP->getPointerOperand() == Ptr)
72 countNumMemAccesses(GEP, NumStores, NumLoads, F);
73 }
74 }
75 }
76}
77
79 unsigned Bonus = 0;
80 const Function *Caller = CB->getParent()->getParent();
81 const Function *Callee = CB->getCalledFunction();
82 if (!Callee)
83 return 0;
84
85 // Increase the threshold if an incoming argument is used only as a memcpy
86 // source.
87 for (const Argument &Arg : Callee->args()) {
88 bool OtherUse = false;
89 if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse) {
90 Bonus = 1000;
91 break;
92 }
93 }
94
95 // Give bonus for globals used much in both caller and a relatively small
96 // callee.
97 unsigned InstrCount = 0;
99 for (auto &I : instructions(Callee)) {
100 if (++InstrCount == 200) {
101 Ptr2NumUses.clear();
102 break;
103 }
104 if (const auto *SI = dyn_cast<StoreInst>(&I)) {
105 if (!SI->isVolatile())
106 if (auto *GV = dyn_cast<GlobalVariable>(SI->getPointerOperand()))
107 Ptr2NumUses[GV]++;
108 } else if (const auto *LI = dyn_cast<LoadInst>(&I)) {
109 if (!LI->isVolatile())
110 if (auto *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand()))
111 Ptr2NumUses[GV]++;
112 } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
113 if (auto *GV = dyn_cast<GlobalVariable>(GEP->getPointerOperand())) {
114 unsigned NumStores = 0, NumLoads = 0;
115 countNumMemAccesses(GEP, NumStores, NumLoads, Callee);
116 Ptr2NumUses[GV] += NumLoads + NumStores;
117 }
118 }
119 }
120
121 for (auto [Ptr, NumCalleeUses] : Ptr2NumUses)
122 if (NumCalleeUses > 10) {
123 unsigned CallerStores = 0, CallerLoads = 0;
124 countNumMemAccesses(Ptr, CallerStores, CallerLoads, Caller);
125 if (CallerStores + CallerLoads > 10) {
126 Bonus = 1000;
127 break;
128 }
129 }
130
131 // Give bonus when Callee accesses an Alloca of Caller heavily.
132 unsigned NumStores = 0;
133 unsigned NumLoads = 0;
134 for (unsigned OpIdx = 0; OpIdx != Callee->arg_size(); ++OpIdx) {
135 Value *CallerArg = CB->getArgOperand(OpIdx);
136 Argument *CalleeArg = Callee->getArg(OpIdx);
137 if (isa<AllocaInst>(CallerArg))
138 countNumMemAccesses(CalleeArg, NumStores, NumLoads, Callee);
139 }
140 if (NumLoads > 10)
141 Bonus += NumLoads * 50;
142 if (NumStores > 10)
143 Bonus += NumStores * 50;
144 Bonus = std::min(Bonus, unsigned(1000));
145
146 LLVM_DEBUG(if (Bonus)
147 dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
148 return Bonus;
149}
150
154 assert(Ty->isIntegerTy());
155
156 unsigned BitSize = Ty->getPrimitiveSizeInBits();
157 // There is no cost model for constants with a bit size of 0. Return TCC_Free
158 // here, so that constant hoisting will ignore this constant.
159 if (BitSize == 0)
160 return TTI::TCC_Free;
161 // No cost model for operations on integers larger than 128 bit implemented yet.
162 if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
163 return TTI::TCC_Free;
164
165 if (Imm == 0)
166 return TTI::TCC_Free;
167
168 if (Imm.getBitWidth() <= 64) {
169 // Constants loaded via lgfi.
170 if (isInt<32>(Imm.getSExtValue()))
171 return TTI::TCC_Basic;
172 // Constants loaded via llilf.
173 if (isUInt<32>(Imm.getZExtValue()))
174 return TTI::TCC_Basic;
175 // Constants loaded via llihf:
176 if ((Imm.getZExtValue() & 0xffffffff) == 0)
177 return TTI::TCC_Basic;
178
179 return 2 * TTI::TCC_Basic;
180 }
181
182 // i128 immediates loads from Constant Pool
183 return 2 * TTI::TCC_Basic;
184}
185
187 const APInt &Imm, Type *Ty,
189 Instruction *Inst) const {
190 assert(Ty->isIntegerTy());
191
192 unsigned BitSize = Ty->getPrimitiveSizeInBits();
193 // There is no cost model for constants with a bit size of 0. Return TCC_Free
194 // here, so that constant hoisting will ignore this constant.
195 if (BitSize == 0)
196 return TTI::TCC_Free;
197 // No cost model for operations on integers larger than 64 bit implemented yet.
198 if (BitSize > 64)
199 return TTI::TCC_Free;
200
201 switch (Opcode) {
202 default:
203 return TTI::TCC_Free;
204 case Instruction::GetElementPtr:
205 // Always hoist the base address of a GetElementPtr. This prevents the
206 // creation of new constants for every base constant that gets constant
207 // folded with the offset.
208 if (Idx == 0)
209 return 2 * TTI::TCC_Basic;
210 return TTI::TCC_Free;
211 case Instruction::Store:
212 if (Idx == 0 && Imm.getBitWidth() <= 64) {
213 // Any 8-bit immediate store can be implemented via mvi.
214 if (BitSize == 8)
215 return TTI::TCC_Free;
216 // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
217 if (isInt<16>(Imm.getSExtValue()))
218 return TTI::TCC_Free;
219 }
220 break;
221 case Instruction::ICmp:
222 if (Idx == 1 && Imm.getBitWidth() <= 64) {
223 // Comparisons against signed 32-bit immediates implemented via cgfi.
224 if (isInt<32>(Imm.getSExtValue()))
225 return TTI::TCC_Free;
226 // Comparisons against unsigned 32-bit immediates implemented via clgfi.
227 if (isUInt<32>(Imm.getZExtValue()))
228 return TTI::TCC_Free;
229 }
230 break;
231 case Instruction::Add:
232 case Instruction::Sub:
233 if (Idx == 1 && Imm.getBitWidth() <= 64) {
234 // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
235 if (isUInt<32>(Imm.getZExtValue()))
236 return TTI::TCC_Free;
237 // Or their negation, by swapping addition vs. subtraction.
238 if (isUInt<32>(-Imm.getSExtValue()))
239 return TTI::TCC_Free;
240 }
241 break;
242 case Instruction::Mul:
243 if (Idx == 1 && Imm.getBitWidth() <= 64) {
244 // We use msgfi to multiply by 32-bit signed immediates.
245 if (isInt<32>(Imm.getSExtValue()))
246 return TTI::TCC_Free;
247 }
248 break;
249 case Instruction::Or:
250 case Instruction::Xor:
251 if (Idx == 1 && Imm.getBitWidth() <= 64) {
252 // Masks supported by oilf/xilf.
253 if (isUInt<32>(Imm.getZExtValue()))
254 return TTI::TCC_Free;
255 // Masks supported by oihf/xihf.
256 if ((Imm.getZExtValue() & 0xffffffff) == 0)
257 return TTI::TCC_Free;
258 }
259 break;
260 case Instruction::And:
261 if (Idx == 1 && Imm.getBitWidth() <= 64) {
262 // Any 32-bit AND operation can be implemented via nilf.
263 if (BitSize <= 32)
264 return TTI::TCC_Free;
265 // 64-bit masks supported by nilf.
266 if (isUInt<32>(~Imm.getZExtValue()))
267 return TTI::TCC_Free;
268 // 64-bit masks supported by nilh.
269 if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
270 return TTI::TCC_Free;
271 // Some 64-bit AND operations can be implemented via risbg.
272 const SystemZInstrInfo *TII = ST->getInstrInfo();
273 unsigned Start, End;
274 if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
275 return TTI::TCC_Free;
276 }
277 break;
278 case Instruction::Shl:
279 case Instruction::LShr:
280 case Instruction::AShr:
281 // Always return TCC_Free for the shift value of a shift instruction.
282 if (Idx == 1)
283 return TTI::TCC_Free;
284 break;
285 case Instruction::UDiv:
286 case Instruction::SDiv:
287 case Instruction::URem:
288 case Instruction::SRem:
289 case Instruction::Trunc:
290 case Instruction::ZExt:
291 case Instruction::SExt:
292 case Instruction::IntToPtr:
293 case Instruction::PtrToInt:
294 case Instruction::BitCast:
295 case Instruction::PHI:
296 case Instruction::Call:
297 case Instruction::Select:
298 case Instruction::Ret:
299 case Instruction::Load:
300 break;
301 }
302
304}
305
308 const APInt &Imm, Type *Ty,
310 assert(Ty->isIntegerTy());
311
312 unsigned BitSize = Ty->getPrimitiveSizeInBits();
313 // There is no cost model for constants with a bit size of 0. Return TCC_Free
314 // here, so that constant hoisting will ignore this constant.
315 if (BitSize == 0)
316 return TTI::TCC_Free;
317 // No cost model for operations on integers larger than 64 bit implemented yet.
318 if (BitSize > 64)
319 return TTI::TCC_Free;
320
321 switch (IID) {
322 default:
323 return TTI::TCC_Free;
324 case Intrinsic::sadd_with_overflow:
325 case Intrinsic::uadd_with_overflow:
326 case Intrinsic::ssub_with_overflow:
327 case Intrinsic::usub_with_overflow:
328 // These get expanded to include a normal addition/subtraction.
329 if (Idx == 1 && Imm.getBitWidth() <= 64) {
330 if (isUInt<32>(Imm.getZExtValue()))
331 return TTI::TCC_Free;
332 if (isUInt<32>(-Imm.getSExtValue()))
333 return TTI::TCC_Free;
334 }
335 break;
336 case Intrinsic::smul_with_overflow:
337 case Intrinsic::umul_with_overflow:
338 // These get expanded to include a normal multiplication.
339 if (Idx == 1 && Imm.getBitWidth() <= 64) {
340 if (isInt<32>(Imm.getSExtValue()))
341 return TTI::TCC_Free;
342 }
343 break;
344 case Intrinsic::experimental_stackmap:
345 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
346 return TTI::TCC_Free;
347 break;
348 case Intrinsic::experimental_patchpoint_void:
349 case Intrinsic::experimental_patchpoint:
350 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
351 return TTI::TCC_Free;
352 break;
353 }
355}
356
// Report the kind of population-count support available for an integer of
// TyWidth bits. TyWidth must be a power of two (asserted).
// NOTE(review): the return-type line (listing line 357) is absent from this
// extract, so the declaration below is incomplete as shown.
358SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) const {
359 assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
// The subtarget has the population-count facility and the width is <= 64.
360 if (ST->hasPopulationCount() && TyWidth <= 64)
// NOTE(review): listing line 361 -- the statement controlled by the `if`
// above -- is missing. As extracted, the condition appears to guard the
// PSK_Software return below, which is presumably not the intent; restore the
// missing line from the upstream file.
362 return TTI::PSK_Software;
363}
364
367 OptimizationRemarkEmitter *ORE) const {
368 // Find out if L contains a call, what the machine instruction count
369 // estimate is, and how many stores there are.
370 bool HasCall = false;
371 InstructionCost NumStores = 0;
372 for (auto &BB : L->blocks())
373 for (auto &I : *BB) {
374 if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
375 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
376 if (isLoweredToCall(F))
377 HasCall = true;
378 if (F->getIntrinsicID() == Intrinsic::memcpy ||
379 F->getIntrinsicID() == Intrinsic::memset)
380 NumStores++;
381 } else { // indirect call.
382 HasCall = true;
383 }
384 }
385 if (isa<StoreInst>(&I)) {
386 Type *MemAccessTy = I.getOperand(0)->getType();
387 NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, Align(),
389 }
390 }
391
392 // The z13 processor will run out of store tags if too many stores
393 // are fed into it too quickly. Therefore make sure there are not
394 // too many stores in the resulting unrolled loop.
395 unsigned const NumStoresVal = NumStores.getValue();
396 unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);
397
398 if (HasCall) {
399 // Only allow full unrolling if loop has any calls.
400 UP.FullUnrollMaxCount = Max;
401 UP.MaxCount = 1;
402 return;
403 }
404
405 UP.MaxCount = Max;
406 if (UP.MaxCount <= 1)
407 return;
408
409 // Allow partial and runtime trip count unrolling.
410 UP.Partial = UP.Runtime = true;
411
412 UP.PartialThreshold = 75;
414
415 // Allow expensive instructions in the pre-header of the loop.
416 UP.AllowExpensiveTripCount = true;
417
418 UP.Force = true;
419}
420
425
428 const TargetTransformInfo::LSRCost &C2) const {
429 // SystemZ specific: check instruction count (first), and don't care about
430 // ImmCost, since offsets are checked explicitly.
431 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
432 C1.NumIVMuls, C1.NumBaseAdds,
433 C1.ScaleCost, C1.SetupCost) <
434 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
435 C2.NumIVMuls, C2.NumBaseAdds,
436 C2.ScaleCost, C2.SetupCost);
437}
438
439unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
440 bool Vector = (ClassID == 1);
441 if (!Vector)
442 // Discount the stack pointer. Also leave out %r0, since it can't
443 // be used in an address.
444 return 14;
445 if (ST->hasVector())
446 return 32;
447 return 0;
448}
449
452 switch (K) {
454 return TypeSize::getFixed(64);
456 return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
458 return TypeSize::getScalable(0);
459 }
460
461 llvm_unreachable("Unsupported register kind");
462}
463
464unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
465 unsigned NumStridedMemAccesses,
466 unsigned NumPrefetches,
467 bool HasCall) const {
468 // Don't prefetch a loop with many far apart accesses.
469 if (NumPrefetches > 16)
470 return UINT_MAX;
471
472 // Emit prefetch instructions for smaller strides in cases where we think
473 // the hardware prefetcher might not be able to keep up.
474 if (NumStridedMemAccesses > 32 && !HasCall &&
475 (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
476 return 1;
477
478 return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
479}
480
481unsigned
483 bool HasUnorderedReductions) const {
484 return VF.isVector() ? 8 : 1;
485}
486
487bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) const {
488 EVT VT = TLI->getValueType(DL, DataType);
489 return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
490}
491
492static bool isFreeEltLoad(const Value *Op) {
493 if (isa<LoadInst>(Op) && Op->hasOneUse()) {
494 const Instruction *UserI = cast<Instruction>(*Op->user_begin());
495 return !isa<StoreInst>(UserI); // Prefer MVC
496 }
497 return false;
498}
499
501 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
502 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
503 TTI::VectorInstrContext VIC) const {
504 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
506
507 if (Insert && Ty->isIntOrIntVectorTy(64)) {
508 // VLVGP will insert two GPRs with one instruction, while VLE will load
509 // an element directly with no extra cost
510 assert((VL.empty() || VL.size() == NumElts) &&
511 "Type does not match the number of values.");
512 InstructionCost CurrVectorCost = 0;
513 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
514 if (DemandedElts[Idx] && !(VL.size() && isFreeEltLoad(VL[Idx])))
515 ++CurrVectorCost;
516 if (Idx % 2 == 1) {
517 Cost += std::min(InstructionCost(1), CurrVectorCost);
518 CurrVectorCost = 0;
519 }
520 }
521 Insert = false;
522 }
523
524 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
525 CostKind, ForPoisonSrc, VL);
526 return Cost;
527}
528
529// Return the bit size for the scalar type or vector element
530// type. getScalarSizeInBits() returns 0 for a pointer type.
531static unsigned getScalarSizeInBits(Type *Ty) {
532 unsigned Size =
533 (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
534 assert(Size > 0 && "Element must have non-zero size.");
535 return Size;
536}
537
538// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
539// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
540// 3.
541static unsigned getNumVectorRegs(Type *Ty) {
542 auto *VTy = cast<FixedVectorType>(Ty);
543 unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
544 assert(WideBits > 0 && "Could not compute size of vector");
545 return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
546}
547
// Decide whether a binary operation forms a load / operate-with-constant /
// store sequence on a single address. Ty supplies the scalar bit width used
// for the per-opcode width checks. Returns true only if every check below
// passes.
// NOTE(review): listing line 549 is absent from this extract. It must
// introduce `BI`; presumably it derives BI from I as a binary operator --
// restore it from the upstream file before building.
548static bool isFoldableRMW(const Instruction *I, Type *Ty) {
// The operation must exist and have exactly one user.
550 if (!BI || !BI->hasOneUse())
551 return false;
552
553 unsigned Opcode = BI->getOpcode();
554 unsigned BitWidth = Ty->getScalarSizeInBits();
555
// Accepted opcode / width combinations: And, Or, Xor on 8 bits; Add, Sub on
// 32 or 64 bits. Anything else is rejected.
556 switch (Opcode) {
557 case Instruction::And:
558 case Instruction::Or:
559 case Instruction::Xor:
560 if (BitWidth != 8)
561 return false;
562 break;
563 case Instruction::Add:
564 case Instruction::Sub:
565 if (BitWidth != 32 && BitWidth != 64)
566 return false;
567 break;
568 default:
569 return false;
570 }
571
// At least one of the two operands has to be a constant integer.
572 Value *Op0 = BI->getOperand(0), *Op1 = BI->getOperand(1);
573 if (!isa<ConstantInt>(Op0) && !isa<ConstantInt>(Op1))
574 return false;
575
// V is the operand expected to be the load: for Sub it is always operand 0,
// otherwise whichever operand is not the constant (operand 0 if both are).
576 Value *V =
577 (Opcode == Instruction::Sub) ? Op0 : (isa<ConstantInt>(Op0) ? Op1 : Op0);
// For Sub the constant must be the second operand.
578 if (Opcode == Instruction::Sub && !isa<ConstantInt>(Op1))
579 return false;
580
581 auto *LI = dyn_cast_or_null<LoadInst>(V);
582 // Already checked BI hasOneUse.
583 auto *SI = dyn_cast<StoreInst>(BI->user_back());
584
// Require a non-volatile, single-use load and a non-volatile store that both
// use the same pointer operand.
585 return LI && SI && !LI->isVolatile() && !SI->isVolatile() &&
586 LI->hasOneUse() && LI->getPointerOperand() == SI->getPointerOperand();
587}
588
590 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
592 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
593
594 // TODO: Handle more cost kinds.
596 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
597 Op2Info, Args, CxtI);
598 if (CxtI && Ty && !Ty->isVectorTy() && isFoldableRMW(CxtI, Ty))
599 return TTI::TCC_Free;
600 // TODO: return a good value for BB-VECTORIZER that includes the
601 // immediate loads, which we do not want to count for the loop
602 // vectorizer, since they are hopefully hoisted out of the loop. This
603 // would require a new parameter 'InLoop', but not sure if constant
604 // args are common enough to motivate this.
605
606 unsigned ScalarBits = Ty->getScalarSizeInBits();
607
608 // There are three cases of division and remainder: Dividing with a register
609 // needs a divide instruction. A divisor which is a power of two constant
610 // can be implemented with a sequence of shifts. Any other constant needs a
611 // multiply and shifts.
612 const unsigned DivInstrCost = 20;
613 const unsigned DivMulSeqCost = 10;
614 const unsigned SDivPow2Cost = 4;
615
616 bool SignedDivRem =
617 Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
618 bool UnsignedDivRem =
619 Opcode == Instruction::UDiv || Opcode == Instruction::URem;
620
621 // Check for a constant divisor.
622 bool DivRemConst = false;
623 bool DivRemConstPow2 = false;
624 if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
625 if (const Constant *C = dyn_cast<Constant>(Args[1])) {
626 const ConstantInt *CVal =
627 (C->getType()->isVectorTy()
628 ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
630 if (CVal && (CVal->getValue().isPowerOf2() ||
631 CVal->getValue().isNegatedPowerOf2()))
632 DivRemConstPow2 = true;
633 else
634 DivRemConst = true;
635 }
636 }
637
638 if (!Ty->isVectorTy()) {
639 // These FP operations are supported with a dedicated instruction for
640 // float, double and fp128 (base implementation assumes float generally
641 // costs 2).
642 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
643 Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
644 return 1;
645
646 // There is no native support for FRem.
647 if (Opcode == Instruction::FRem)
648 return LIBCALL_COST;
649
650 // Give discount for some combined logical operations if supported.
651 if (Args.size() == 2) {
652 if (Opcode == Instruction::Xor) {
653 for (const Value *A : Args) {
654 if (const Instruction *I = dyn_cast<Instruction>(A))
655 if (I->hasOneUse() &&
656 (I->getOpcode() == Instruction::Or ||
657 I->getOpcode() == Instruction::And ||
658 I->getOpcode() == Instruction::Xor))
659 if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
660 (isInt128InVR(Ty) &&
661 (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1())))
662 return 0;
663 }
664 }
665 else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
666 for (const Value *A : Args) {
667 if (const Instruction *I = dyn_cast<Instruction>(A))
668 if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
669 ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
670 (isInt128InVR(Ty) &&
671 (Opcode == Instruction::And || ST->hasVectorEnhancements1()))))
672 return 0;
673 }
674 }
675 }
676
677 // Or requires one instruction, although it has custom handling for i64.
678 if (Opcode == Instruction::Or)
679 return 1;
680
681 if (Opcode == Instruction::Xor && ScalarBits == 1) {
682 if (ST->hasLoadStoreOnCond2())
683 return 5; // 2 * (li 0; loc 1); xor
684 return 7; // 2 * ipm sequences ; xor ; shift ; compare
685 }
686
687 if (DivRemConstPow2)
688 return (SignedDivRem ? SDivPow2Cost : 1);
689 if (DivRemConst)
690 return DivMulSeqCost;
691 if (SignedDivRem || UnsignedDivRem)
692 return DivInstrCost;
693 }
694 else if (ST->hasVector()) {
695 auto *VTy = cast<FixedVectorType>(Ty);
696 unsigned VF = VTy->getNumElements();
697 unsigned NumVectors = getNumVectorRegs(Ty);
698
699 // These vector operations are custom handled, but are still supported
700 // with one instruction per vector, regardless of element size.
701 if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
702 Opcode == Instruction::AShr) {
703 return NumVectors;
704 }
705
706 if (DivRemConstPow2)
707 return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
708 if (DivRemConst) {
709 SmallVector<Type *> Tys(Args.size(), Ty);
710 return VF * DivMulSeqCost +
712 }
713 if (SignedDivRem || UnsignedDivRem) {
714 if (ST->hasVectorEnhancements3() && ScalarBits >= 32)
715 return NumVectors * DivInstrCost;
716 else if (VF > 4)
717 // Temporary hack: disable high vectorization factors with integer
718 // division/remainder, which will get scalarized and handled with
719 // GR128 registers. The mischeduler is not clever enough to avoid
720 // spilling yet.
721 return 1000;
722 }
723
724 // These FP operations are supported with a single vector instruction for
725 // double (base implementation assumes float generally costs 2). For
726 // FP128, the scalar cost is 1, and there is no overhead since the values
727 // are already in scalar registers.
728 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
729 Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
730 switch (ScalarBits) {
731 case 32: {
732 // The vector enhancements facility 1 provides v4f32 instructions.
733 if (ST->hasVectorEnhancements1())
734 return NumVectors;
735 // Return the cost of multiple scalar invocation plus the cost of
736 // inserting and extracting the values.
737 InstructionCost ScalarCost =
738 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
739 SmallVector<Type *> Tys(Args.size(), Ty);
741 (VF * ScalarCost) +
743 // FIXME: VF 2 for these FP operations are currently just as
744 // expensive as for VF 4.
745 if (VF == 2)
746 Cost *= 2;
747 return Cost;
748 }
749 case 64:
750 case 128:
751 return NumVectors;
752 default:
753 break;
754 }
755 }
756
757 // There is no native support for FRem.
758 if (Opcode == Instruction::FRem) {
759 SmallVector<Type *> Tys(Args.size(), Ty);
761 (VF * LIBCALL_COST) +
763 // FIXME: VF 2 for float is currently just as expensive as for VF 4.
764 if (VF == 2 && ScalarBits == 32)
765 Cost *= 2;
766 return Cost;
767 }
768 }
769
770 // Fallback to the default implementation.
771 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
772 Args, CxtI);
773}
774
777 VectorType *SrcTy, ArrayRef<int> Mask,
778 TTI::TargetCostKind CostKind, int Index,
780 const Instruction *CxtI) const {
781 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
782 if (ST->hasVector()) {
783 unsigned NumVectors = getNumVectorRegs(SrcTy);
784
785 // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
786
787 // FP128 values are always in scalar registers, so there is no work
788 // involved with a shuffle, except for broadcast. In that case register
789 // moves are done with a single instruction per element.
790 if (SrcTy->getScalarType()->isFP128Ty())
791 return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
792
793 switch (Kind) {
795 // ExtractSubvector Index indicates start offset.
796
797 // Extracting a subvector from first index is a noop.
798 return (Index == 0 ? 0 : NumVectors);
799
801 // Loop vectorizer calls here to figure out the extra cost of
802 // broadcasting a loaded value to all elements of a vector. Since vlrep
803 // loads and replicates with a single instruction, adjust the returned
804 // value.
805 return NumVectors - 1;
806
807 default:
808
809 // SystemZ supports single instruction permutation / replication.
810 return NumVectors;
811 }
812 }
813
814 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
815 SubTp);
816}
817
818// Return the log2 difference of the element sizes of the two vector types.
819static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
820 unsigned Bits0 = getScalarSizeInBits(Ty0);
821 unsigned Bits1 = getScalarSizeInBits(Ty1);
822
823 if (Bits1 > Bits0)
824 return (Log2_32(Bits1) - Log2_32(Bits0));
825
826 return (Log2_32(Bits0) - Log2_32(Bits1));
827}
828
829// Return the number of instructions needed to truncate SrcTy to DstTy.
// Both types must be vector types (asserted).
830unsigned SystemZTTIImpl::getVectorTruncCost(Type *SrcTy, Type *DstTy) const {
831 assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
// NOTE(review): listing lines 832, 834 and 835 are absent from this extract.
// Only the message strings of two further asserts survive below, so the
// conditions they check are not visible here (the messages suggest a size
// reduction and an unchanged element count -- confirm against upstream).
833 "Packing must reduce size of vector type.");
836 "Packing should not change number of elements.");
837
838 // TODO: Since fp32 is expanded, the extract cost should always be 0.
839
// NumParts starts as the number of 128-bit registers occupied by the source.
840 unsigned NumParts = getNumVectorRegs(SrcTy);
841 if (NumParts <= 2)
842 // Up to 2 vector registers can be truncated efficiently with pack or
843 // permute. The latter requires an immediate mask to be loaded, which
844 // typically gets hoisted out of a loop. TODO: return a good value for
845 // BB-VECTORIZER that includes the immediate loads, which we do not want
846 // to count for the loop vectorizer.
847 return 1;
848
// More than two source registers: for each halving of the element size,
// halve the register count (never below 1) and add it to the total.
849 unsigned Cost = 0;
850 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
851 unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
852 for (unsigned P = 0; P < Log2Diff; ++P) {
853 if (NumParts > 1)
854 NumParts /= 2;
855 Cost += NumParts;
856 }
857
858 // Currently, a general mix of permutes and pack instructions is output by
859 // isel, which follow the cost computation above except for this case which
860 // is one instruction less:
// i.e. 8 elements truncated from 64-bit to 8-bit.
861 if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
862 DstTy->getScalarSizeInBits() == 8)
863 Cost--;
864
865 return Cost;
866}
867
868// Return the cost of converting a vector bitmask produced by a compare
869// (SrcTy), to the type of the select or extend instruction (DstTy).
871 Type *DstTy) const {
872 assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
873 "Should only be called with vector types.");
874
875 unsigned PackCost = 0;
876 unsigned SrcScalarBits = getScalarSizeInBits(SrcTy);
877 unsigned DstScalarBits = getScalarSizeInBits(DstTy);
878 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
879 if (SrcScalarBits > DstScalarBits)
880 // The bitmask will be truncated.
881 PackCost = getVectorTruncCost(SrcTy, DstTy);
882 else if (SrcScalarBits < DstScalarBits) {
883 unsigned DstNumParts = getNumVectorRegs(DstTy);
884 // Each vector select needs its part of the bitmask unpacked.
885 PackCost = Log2Diff * DstNumParts;
886 // Extra cost for moving part of mask before unpacking.
887 PackCost += DstNumParts - 1;
888 }
889
890 return PackCost;
891}
892
893// Return the type of the compared operands. This is needed to compute the
894// cost for a Select / ZExt or SExt instruction.
895static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
896 Type *OpTy = nullptr;
897 if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
898 OpTy = CI->getOperand(0)->getType();
899 else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
900 if (LogicI->getNumOperands() == 2)
901 if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
902 if (isa<CmpInst>(LogicI->getOperand(1)))
903 OpTy = CI0->getOperand(0)->getType();
904
905 if (OpTy != nullptr) {
906 if (VF == 1) {
907 assert (!OpTy->isVectorTy() && "Expected scalar type");
908 return OpTy;
909 }
910 // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
911 // be either scalar or already vectorized with a same or lesser VF.
912 Type *ElTy = OpTy->getScalarType();
913 return FixedVectorType::get(ElTy, VF);
914 }
915
916 return nullptr;
917}
918
919// Get the cost of converting a boolean vector to a vector with same width
920// and element size as Dst, plus the cost of zero extending if needed.
921unsigned
923 const Instruction *I) const {
924 auto *DstVTy = cast<FixedVectorType>(Dst);
925 unsigned VF = DstVTy->getNumElements();
926 unsigned Cost = 0;
927 // If we know the widths of the compared operands, get any cost of
928 // converting it to match Dst. Otherwise assume same widths.
929 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
930 if (CmpOpTy != nullptr)
931 Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
932 if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
933 // One 'vn' per dst vector with an immediate mask.
934 Cost += getNumVectorRegs(Dst);
935 return Cost;
936}
937
939 Type *Src,
942 const Instruction *I) const {
// Estimates the cost of the cast Opcode from Src to Dst. Every case that is
// not special-cased below is delegated to BaseT::getCastInstrCost().
// NOTE(review): parts of the signature and the condition opening the block
// closed at listing line 947 are missing from this extract.
943 // FIXME: Can the logic below also be used for these cost kinds?
// Clamp the base estimate: a free cast stays free, anything else costs 1.
945 auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
946 return BaseCost == 0 ? BaseCost : 1;
947 }
948
949 unsigned DstScalarBits = Dst->getScalarSizeInBits();
950 unsigned SrcScalarBits = Src->getScalarSizeInBits();
951
// Scalar source.
952 if (!Src->isVectorTy()) {
953 if (Dst->isVectorTy())
954 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
955
956 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
957 if (Src->isIntegerTy(128))
958 return LIBCALL_COST;
// A source of at least 32 bits, or an operand that is a load, costs 1.
959 if (SrcScalarBits >= 32 ||
960 (I != nullptr && isa<LoadInst>(I->getOperand(0))))
961 return 1;
962 return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
963 }
964
965 if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
966 Dst->isIntegerTy(128))
967 return LIBCALL_COST;
968
969 if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
970 if (Src->isIntegerTy(1)) {
971 if (DstScalarBits == 128) {
972 if (Opcode == Instruction::SExt && ST->hasVectorEnhancements3())
973 return 0;/*VCEQQ*/
974 return 5 /*branch seq.*/;
975 }
976
977 if (ST->hasLoadStoreOnCond2())
978 return 2; // li 0; loc 1
979
980 // This should be extension of a compare i1 result, which is done with
981 // ipm and a varying sequence of instructions.
982 unsigned Cost = 0;
983 if (Opcode == Instruction::SExt)
984 Cost = (DstScalarBits < 64 ? 3 : 4);
985 if (Opcode == Instruction::ZExt)
986 Cost = 3;
987 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
988 if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
989 // If operands of an fp-type was compared, this costs +1.
990 Cost++;
991 return Cost;
992 }
993 else if (isInt128InVR(Dst)) {
994 // Extensions from GPR to i128 (in VR) typically costs two instructions,
995 // but a zero-extending load would be just one extra instruction.
996 if (Opcode == Instruction::ZExt && I != nullptr)
997 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
998 if (Ld->hasOneUse())
999 return 1;
1000 return 2;
1001 }
1002 }
1003
1004 if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
1005 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
1006 if (Ld->hasOneUse())
1007 return 0; // Will be converted to GPR load.
// A truncate whose users are all stores is treated as free.
1008 bool OnlyTruncatingStores = true;
1009 for (const User *U : I->users())
1010 if (!isa<StoreInst>(U)) {
1011 OnlyTruncatingStores = false;
1012 break;
1013 }
1014 if (OnlyTruncatingStores)
1015 return 0;
1016 return 2; // Vector element extraction.
1017 }
1018 }
1019 else if (ST->hasVector()) {
1020 // Vector source. A scalar destination (vector to scalar cast) is
// handed to the base implementation just below.
1021 auto *SrcVecTy = cast<FixedVectorType>(Src);
1022 auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
1023 if (!DstVecTy) {
1024 // TODO: tune vector-to-scalar cast.
1025 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1026 }
1027 unsigned VF = SrcVecTy->getNumElements();
1028 unsigned NumDstVectors = getNumVectorRegs(Dst);
1029 unsigned NumSrcVectors = getNumVectorRegs(Src);
1030
1031 if (Opcode == Instruction::Trunc) {
1032 if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
1033 return 0; // Check for NOOP conversions.
1034 return getVectorTruncCost(Src, Dst);
1035 }
1036
1037 if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
1038 if (SrcScalarBits >= 8) {
1039 // ZExt will use either a single unpack or a vector permute.
1040 if (Opcode == Instruction::ZExt)
1041 return NumDstVectors;
1042
1043 // SExt will be handled with one unpack per doubling of width.
1044 unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
1045
1046 // For types that spans multiple vector registers, some additional
1047 // instructions are used to setup the unpacking.
1048 unsigned NumSrcVectorOps =
1049 (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
1050 : (NumDstVectors / 2));
1051
1052 return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
1053 }
1054 else if (SrcScalarBits == 1)
1055 return getBoolVecToIntConversionCost(Opcode, Dst, I);
1056 }
1057
1058 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
1059 Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
1060 // TODO: Fix base implementation which could simplify things a bit here
1061 // (seems to miss on differentiating on scalar/vector types).
1062
1063 // Only 64 bit vector conversions are natively supported before z15.
1064 if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
1065 if (SrcScalarBits == DstScalarBits)
1066 return NumDstVectors;
1067
1068 if (SrcScalarBits == 1)
1069 return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
1070 }
1071
1072 // Return the cost of multiple scalar invocation plus the cost of
1073 // inserting and extracting the values. Base implementation does not
1074 // realize float->int gets scalarized.
1075 InstructionCost ScalarCost = getCastInstrCost(
1076 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
1077 InstructionCost TotCost = VF * ScalarCost;
1078 bool NeedsInserts = true, NeedsExtracts = true;
1079 // FP128 registers do not get inserted or extracted.
1080 if (DstScalarBits == 128 &&
1081 (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
1082 NeedsInserts = false;
1083 if (SrcScalarBits == 128 &&
1084 (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
1085 NeedsExtracts = false;
1086
// Extraction overhead is charged on the source vector, insertion overhead
// on the destination vector.
1087 TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
1088 NeedsExtracts, CostKind);
1089 TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
1090 /*Extract*/ false, CostKind);
1091
1092 // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
1093 if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
1094 TotCost *= 2;
1095
1096 return TotCost;
1097 }
1098
1099 if (Opcode == Instruction::FPTrunc) {
1100 if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
1101 return VF /*ldxbr/lexbr*/ +
1102 BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
1103 /*Extract*/ false, CostKind);
1104 else // double -> float
1105 return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
1106 }
1107
1108 if (Opcode == Instruction::FPExt) {
1109 if (SrcScalarBits == 32 && DstScalarBits == 64) {
1110 // float -> double is very rare and currently unoptimized. Instead of
1111 // using vldeb, which can do two at a time, all conversions are
1112 // scalarized.
1113 return VF * 2;
1114 }
1115 // -> fp128. VF * lxdb/lxeb + extraction of elements.
1116 return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
1117 /*Extract*/ true, CostKind);
1118 }
1119 }
1120
// No SystemZ-specific rule matched.
1121 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1122}
1123
1124// Scalar i8 / i16 operations will typically be made after first extending
1125// the operands to i32.
// Returns the number of operand extensions counted for I.
// NOTE(review): the condition guarding the increment (listing line 1130) is
// missing from this extract; going by the comment inside the loop it
// presumably skips operands that are loads - confirm against the real file.
1126static unsigned getOperandsExtensionCost(const Instruction *I) {
1127 unsigned ExtCost = 0;
1128 for (Value *Op : I->operands())
1129 // A load of i8 or i16 sign/zero extends to i32.
1131 ExtCost++;
1132
1133 return ExtCost;
1134}
1135
1138 const Instruction *I) const {
// Cost of a control-flow instruction.
// NOTE(review): the first signature line and the condition guarding the
// first return (listing line 1139) are missing from this extract. Under that
// condition a PHI is free and every other opcode has basic cost.
1140 return Opcode == Instruction::PHI ? TTI::TCC_Free : TTI::TCC_Basic;
1141 // Branches are assumed to be predicted.
1142 return TTI::TCC_Free;
1143}
1144
1146 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
1148 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
// Cost of a compare (ICmp / FCmp) or Select on ValTy. Cases not handled
// below are delegated to BaseT::getCmpSelInstrCost().
// NOTE(review): the condition guarding the early return right below (listing
// line 1149) is missing from this extract.
1150 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1151 Op1Info, Op2Info);
1152
// Scalar operations.
1153 if (!ValTy->isVectorTy()) {
1154 switch (Opcode) {
1155 case Instruction::ICmp: {
1156 // A loaded value compared with 0 with multiple users becomes Load and
1157 // Test. The load is then not foldable, so return 0 cost for the ICmp.
1158 unsigned ScalarBits = ValTy->getScalarSizeInBits();
1159 if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
1160 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
1161 if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
1162 if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
1163 C->isZero())
1164 return 0;
1165
// Integer compares of at most 16 bits also pay for extending the operands;
// without an instruction to inspect, two extensions are assumed.
1166 unsigned Cost = 1;
1167 if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
1168 Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
1169 return Cost;
1170 }
1171 case Instruction::Select:
1172 if (ValTy->isFloatingPointTy())
1173 return 4; // No LOC for FP - costs a conditional jump.
1174
1175 // When selecting based on an i128 comparison, LOC / VSEL is possible
1176 // if i128 comparisons are directly supported.
1177 if (I != nullptr)
1178 if (ICmpInst *CI = dyn_cast<ICmpInst>(I->getOperand(0)))
1179 if (CI->getOperand(0)->getType()->isIntegerTy(128))
1180 return ST->hasVectorEnhancements3() ? 1 : 4;
1181
1182 // Load On Condition / Select Register available, except for i128.
1183 return !isInt128InVR(ValTy) ? 1 : 4;
1184 }
1185 }
// Vector operations, only costed here when the subtarget has vector support.
1186 else if (ST->hasVector()) {
1187 unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();
1188
1189 // Called with a compare instruction.
1190 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
1191 unsigned PredicateExtraCost = 0;
1192 if (I != nullptr) {
1193 // Some predicates cost one or two extra instructions.
// NOTE(review): the case labels selecting the two extra-cost groups below
// are missing from this extract.
1194 switch (cast<CmpInst>(I)->getPredicate()) {
1200 PredicateExtraCost = 1;
1201 break;
1206 PredicateExtraCost = 2;
1207 break;
1208 default:
1209 break;
1210 }
1211 }
1212
1213 // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
1214 // floats. FIXME: <2 x float> generates same code as <4 x float>.
1215 unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
1216 unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
1217
1218 unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
1219 return Cost;
1220 }
1221 else { // Called with a select instruction.
1222 assert (Opcode == Instruction::Select);
1223
1224 // We can figure out the extra cost of packing / unpacking if the
1225 // instruction was passed and the compare instruction is found.
1226 unsigned PackCost = 0;
1227 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
1228 if (CmpOpTy != nullptr)
1229 PackCost =
1230 getVectorBitmaskConversionCost(CmpOpTy, ValTy);
1231
1232 return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
1233 }
1234 }
1235
// Reached for scalar opcodes other than ICmp / Select and for vector types
// on subtargets without vector support.
1236 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1237 Op1Info, Op2Info);
1238}
1239
1241 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
1242 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
// Cost of inserting or extracting the vector element at Index of Val. Other
// opcodes, and inserts not special-cased below, go to the base implementation.
// NOTE(review): the first signature line is missing from this extract.
1243 if (Opcode == Instruction::InsertElement) {
1244 // Vector Element Load.
1245 if (Op1 != nullptr && isFreeEltLoad(Op1))
1246 return 0;
1247
1248 // vlvgp will insert two grs into a vector register, so count half the
1249 // number of instructions as an estimate when we don't have the full
1250 // picture (as in getScalarizationOverhead()).
// Only even indices are charged, so each pair of i64 inserts costs 1.
1251 if (Val->isIntOrIntVectorTy(64))
1252 return ((Index % 2 == 0) ? 1 : 0);
1253 }
1254
1255 if (Opcode == Instruction::ExtractElement) {
1256 int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
1257
1258 // Give a slight penalty for moving out of vector pipeline to FXU unit.
1259 if (Index == 0 && Val->isIntOrIntVectorTy())
1260 Cost += 1;
1261
1262 return Cost;
1263 }
1264
1265 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1, VIC);
1266}
1267
1268// Check if a load may be folded as a memory operand in its user.
// FoldedValue is left untouched when Ld has more than one use. Otherwise it
// is set to Ld, or to Ld's single trunc/sext/zext user when that user itself
// has exactly one use, and this happens even if the function then returns
// false.
// NOTE(review): the first signature line is missing from this extract.
1270 const Instruction *&FoldedValue) const {
1271 if (!Ld->hasOneUse())
1272 return false;
1273 FoldedValue = Ld;
1274 const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
1275 unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
// At most one of the three widths below becomes non-zero; it records the
// result width of a single-use trunc / sext / zext of the load.
1276 unsigned TruncBits = 0;
1277 unsigned SExtBits = 0;
1278 unsigned ZExtBits = 0;
1279 if (UserI->hasOneUse()) {
1280 unsigned UserBits = UserI->getType()->getScalarSizeInBits();
1281 if (isa<TruncInst>(UserI))
1282 TruncBits = UserBits;
1283 else if (isa<SExtInst>(UserI))
1284 SExtBits = UserBits;
1285 else if (isa<ZExtInst>(UserI))
1286 ZExtBits = UserBits;
1287 }
// Look through the conversion: from here on UserI is the instruction that
// consumes the (converted) loaded value.
1288 if (TruncBits || SExtBits || ZExtBits) {
1289 FoldedValue = UserI;
1290 UserI = cast<Instruction>(*UserI->user_begin());
1291 // Load (single use) -> trunc/extend (single use) -> UserI
1292 }
1293 if ((UserI->getOpcode() == Instruction::Sub ||
1294 UserI->getOpcode() == Instruction::SDiv ||
1295 UserI->getOpcode() == Instruction::UDiv) &&
1296 UserI->getOperand(1) != FoldedValue)
1297 return false; // Not commutative, only RHS foldable.
1298 // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
1299 // extension was made of the load.
1300 unsigned LoadOrTruncBits =
1301 ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
// The cases deliberately fall through: each group adds the extension forms
// it supports and then shares the checks of the groups after it.
1302 switch (UserI->getOpcode()) {
1303 case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
1304 case Instruction::Sub:
1305 case Instruction::ICmp:
1306 if (LoadedBits == 32 && ZExtBits == 64)
1307 return true;
1308 [[fallthrough]];
1309 case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
1310 if (UserI->getOpcode() != Instruction::ICmp) {
1311 if (LoadedBits == 16 &&
1312 (SExtBits == 32 ||
1313 (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
1314 return true;
1315 if (LoadOrTruncBits == 16)
1316 return true;
1317 }
1318 [[fallthrough]];
1319 case Instruction::SDiv:// SE: 32->64
1320 if (LoadedBits == 32 && SExtBits == 64)
1321 return true;
1322 [[fallthrough]];
1323 case Instruction::UDiv:
1324 case Instruction::And:
1325 case Instruction::Or:
1326 case Instruction::Xor:
1327 // This also makes sense for float operations, but disabled for now due
1328 // to regressions.
1329 // case Instruction::FCmp:
1330 // case Instruction::FAdd:
1331 // case Instruction::FSub:
1332 // case Instruction::FMul:
1333 // case Instruction::FDiv:
1334
1335 // All possible extensions of memory checked above.
1336
1337 // Comparison between memory and immediate.
1338 if (UserI->getOpcode() == Instruction::ICmp)
1339 if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
1340 if (CI->getValue().isIntN(16))
1341 return true;
1342 return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
1343 break;
1344 }
1345 return false;
1346}
1347
1348static bool isBswapIntrinsicCall(const Value *V) {
1349 if (const Instruction *I = dyn_cast<Instruction>(V))
1350 if (auto *CI = dyn_cast<CallInst>(I))
1351 if (auto *F = CI->getCalledFunction())
1352 if (F->getIntrinsicID() == Intrinsic::bswap)
1353 return true;
1354 return false;
1355}
1356
1358 Align Alignment,
1359 unsigned AddressSpace,
1361 TTI::OperandValueInfo OpInfo,
1362 const Instruction *I) const {
// Cost of a load or store of type Src. Returns 0 when the access is expected
// to be folded into another instruction (see the cases below), otherwise the
// number of memory operations needed.
// NOTE(review): parts of the signature are missing from this extract.
1363 assert(!Src->isVoidTy() && "Invalid type");
1364
1365 // FIXME: Load latency isn't handled here
1366 if (Opcode == Instruction::Load && CostKind == TTI::TCK_Latency)
1367 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1368 CostKind, OpInfo, I);
1369
1370 // TODO: Handle other cost kinds.
// NOTE(review): the condition guarding this return (listing line 1371) is
// missing from this extract.
1372 return 1;
1373
// A scalar store that is part of a foldable read-modify-write is free.
1374 if (I && Opcode == Instruction::Store && !Src->isVectorTy()) {
1375 if (isFoldableRMW(dyn_cast<Instruction>(I->getOperand(0)), Src))
1376 return TTI::TCC_Free;
1377 }
1378
1379 if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
1380 // Store the load or its truncated or extended value in FoldedValue.
1381 const Instruction *FoldedValue = nullptr;
1382 if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
1383 const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
1384 assert (UserI->getNumOperands() == 2 && "Expected a binop.");
1385
1386 // UserI can't fold two loads, so in that case return 0 cost only
1387 // half of the time.
1388 for (unsigned i = 0; i < 2; ++i) {
1389 if (UserI->getOperand(i) == FoldedValue)
1390 continue;
1391
1392 if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
1393 LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
1394 if (!OtherLoad &&
1395 (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
1396 isa<ZExtInst>(OtherOp)))
1397 OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
// i is the position of the other operand: this load is charged 1 when the
// other foldable load is operand 0, and 0 when it is operand 1.
1398 if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
1399 return i == 0; // Both operands foldable.
1400 }
1401 }
1402
1403 return 0; // Only I is foldable in user.
1404 }
1405 }
1406
1407 // Type legalization (via getNumberOfParts) can't handle structs
1408 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1409 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1410 CostKind);
1411
1412 // FP128 is a legal type but kept in a register pair on older CPUs.
1413 if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
1414 return 2;
1415
1416 unsigned NumOps =
1417 (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
1418
1419 // Store/Load reversed saves one instruction.
1420 if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
1421 I != nullptr) {
1422 if (Opcode == Instruction::Load && I->hasOneUse()) {
1423 const Instruction *LdUser = cast<Instruction>(*I->user_begin());
1424 // In case of load -> bswap -> store, return normal cost for the load.
// In that chain the store takes the 0 cost in the branch below; otherwise
// the load feeding the bswap is the free one.
1425 if (isBswapIntrinsicCall(LdUser) &&
1426 (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
1427 return 0;
1428 }
1429 else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
1430 const Value *StoredVal = SI->getValueOperand();
1431 if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
1432 return 0;
1433 }
1434 }
1435
1436 return NumOps;
1437}
1438
1439// The generic implementation of getInterleavedMemoryOpCost() is based on
1440// adding costs of the memory operations plus all the extracts and inserts
1441// needed for using / defining the vector operands. The SystemZ version does
1442// roughly the same but bases the computations on vector permutations
1443// instead.
// NOTE(review): the first signature line is missing from this extract.
1445 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1446 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1447 bool UseMaskForCond, bool UseMaskForGaps) const {
// Masked interleaved accesses are left to the base implementation.
1448 if (UseMaskForCond || UseMaskForGaps)
1449 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1450 Alignment, AddressSpace, CostKind,
1451 UseMaskForCond, UseMaskForGaps);
1452 assert(isa<VectorType>(VecTy) &&
1453 "Expect a vector type for interleaved memory op");
1454
1455 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1456 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
// VF is the number of elements per interleaved member.
1457 unsigned VF = NumElts / Factor;
1458 unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
1459 unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
1460 unsigned NumPermutes = 0;
1461
1462 if (Opcode == Instruction::Load) {
1463 // Loading interleave groups may have gaps, which may mean fewer
1464 // loads. Find out how many vectors will be loaded in total, and in how
1465 // many of them each value will be in.
1466 BitVector UsedInsts(NumVectorMemOps, false);
1467 std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
1468 for (unsigned Index : Indices)
1469 for (unsigned Elt = 0; Elt < VF; ++Elt) {
// Register that holds element Elt of member Index in the wide vector.
1470 unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
1471 UsedInsts.set(Vec);
1472 ValueVecs[Index].set(Vec);
1473 }
1474 NumVectorMemOps = UsedInsts.count();
1475
1476 for (unsigned Index : Indices) {
1477 // Estimate that each loaded source vector containing this Index
1478 // requires one operation, except that vperm can handle two input
1479 // registers first time for each dst vector.
1480 unsigned NumSrcVecs = ValueVecs[Index].count();
1481 unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
1482 assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
1483 NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
1484 }
1485 } else {
1486 // Estimate the permutes for each stored vector as the smaller of the
1487 // number of elements and the number of source vectors. Subtract one per
1488 // dst vector for vperm (S.A.).
1489 unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
1490 unsigned NumDstVecs = NumVectorMemOps;
1491 NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
1492 }
1493
1494 // Cost of load/store operations and the permutations needed.
1495 return NumVectorMemOps + NumPermutes;
1496}
1497
1498InstructionCost getIntAddReductionCost(unsigned NumVec, unsigned ScalarBits) {
1499 InstructionCost Cost = 0;
1500 // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
1501 Cost += NumVec - 1;
1502 // For integer adds, VSUM creates shorter reductions on the final vector.
1503 Cost += (ScalarBits < 32) ? 3 : 2;
1504 return Cost;
1505}
1506
1507InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems,
1508 unsigned ScalarBits) {
1509 unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
1510 InstructionCost Cost = 0;
1511 // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
1512 Cost += NumVec - 1;
1513 // For each shuffle / arithmetic layer, we need 2 instructions, and we need
1514 // log2(Elements in Last Vector) layers.
1515 Cost += 2 * Log2_32_Ceil(std::min(NumElems, NumEltsPerVecReg));
1516 return Cost;
1517}
1518
1519inline bool customCostReductions(unsigned Opcode) {
1520 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
1521 Opcode == Instruction::Add || Opcode == Instruction::Mul;
1522}
1523
1526 std::optional<FastMathFlags> FMF,
// Cost of an arithmetic reduction of vector type Ty with operation Opcode.
// NOTE(review): the start and end of the signature, one line of the
// condition below (listing line 1532) and the declaration that receives the
// getFastReductionCost() result (listing line 1540) are missing from this
// extract.
1528 unsigned ScalarBits = Ty->getScalarSizeInBits();
1529 // The following is only for subtargets with vector math, non-ordered
1530 // reductions, and reasonable scalar sizes for int and fp add/mul.
1531 if (customCostReductions(Opcode) && ST->hasVector() &&
1533 ScalarBits <= SystemZ::VectorBits) {
1534 unsigned NumVectors = getNumVectorRegs(Ty);
// NOTE(review): the C-style cast assumes Ty is a FixedVectorType.
1535 unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements();
1536 // Integer Add is using custom code gen, that needs to be accounted for.
1537 if (Opcode == Instruction::Add)
1538 return getIntAddReductionCost(NumVectors, ScalarBits);
1539 // The base cost is the same across all other arithmetic instructions
1541 getFastReductionCost(NumVectors, NumElems, ScalarBits);
1542 // But we need to account for the final op involving the scalar operand.
1543 if ((Opcode == Instruction::FAdd) || (Opcode == Instruction::FMul))
1544 Cost += 1;
1545 return Cost;
1546 }
1547 // otherwise, fall back to the standard implementation
1548 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1549}
1550
1553 FastMathFlags FMF,
// Cost of a min/max reduction of vector type Ty.
// NOTE(review): the start and end of the signature and the declaration of
// Cost (listing line 1560) are missing from this extract.
1555 // Return custom costs only on subtargets with vector enhancements.
1556 if (ST->hasVectorEnhancements1()) {
1557 unsigned NumVectors = getNumVectorRegs(Ty);
// NOTE(review): the C-style cast assumes Ty is a FixedVectorType.
1558 unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements();
1559 unsigned ScalarBits = Ty->getScalarSizeInBits();
1561 // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
1562 Cost += NumVectors - 1;
1563 // For the final vector, we need shuffle + min/max operations, and
1564 // we need #Elements - 1 of them.
// The element count is capped at what fits in one vector register.
1565 Cost += 2 * (std::min(NumElems, SystemZ::VectorBits / ScalarBits) - 1);
1566 return Cost;
1567 }
1568 // For other targets, fall back to the standard implementation
1569 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1570}
1571
// Returns the cost of a vector bswap (one VPERM per vector register of
// RetTy), or -1 for every other intrinsic / return type.
// NOTE(review): the line carrying the function name and its first parameters
// (listing line 1573) is missing from this extract.
1572static int
1574 const SmallVectorImpl<Type *> &ParamTys) {
1575 if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
1576 return getNumVectorRegs(RetTy); // VPERM
1577
1578 return -1;
1579}
1580
1590
// Returns true when the reduction intrinsic II should be expanded.
// NOTE(review): the signature line (listing line 1591) is missing from this
// extract.
1592 // Always expand on Subtargets without vector instructions.
1593 if (!ST->hasVector())
1594 return true;
1595
1596 // Whether or not to expand is a per-intrinsic decision.
1597 switch (II->getIntrinsicID()) {
1598 default:
1599 return true;
1600 // Do not expand vector.reduce.add...
1601 case Intrinsic::vector_reduce_add:
1602 auto *VType = cast<FixedVectorType>(II->getOperand(0)->getType());
1603 // ...unless the scalar size is i64 or larger,
1604 // or the operand vector is not full, since the
1605 // performance benefit is dubious in those cases.
// "Not full" means the whole vector type is smaller than one vector register.
1606 return VType->getScalarSizeInBits() >= 64 ||
1607 VType->getPrimitiveSizeInBits() < SystemZ::VectorBits;
1608 }
1609}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Expand Atomic instructions
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static unsigned InstrCount
Hexagon Common GEP
const HexagonInstrInfo * TII
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
static unsigned getNumElements(Type *Ty)
#define LLVM_DEBUG(...)
Definition Debug.h:119
bool customCostReductions(unsigned Opcode)
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1)
static bool isBswapIntrinsicCall(const Value *V)
InstructionCost getIntAddReductionCost(unsigned NumVec, unsigned ScalarBits)
static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores, unsigned &NumLoads, const Function *F)
static unsigned getOperandsExtensionCost(const Instruction *I)
static Type * getCmpOpsType(const Instruction *I, unsigned VF=1)
static unsigned getScalarSizeInBits(Type *Ty)
static bool isFoldableRMW(const Instruction *I, Type *Ty)
static bool isFreeEltLoad(const Value *Op)
InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems, unsigned ScalarBits)
static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, const SmallVectorImpl< Type * > &ParamTys)
static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse)
static unsigned getNumVectorRegs(Type *Ty)
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
unsigned getNumberOfParts(Type *Tp) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
size_type count() const
Returns the number of bits which are set.
Definition BitVector.h:181
BitVector & set()
Set all bits in the bitvector.
Definition BitVector.h:366
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
This class is the base class for the comparison instructions.
Definition InstrTypes.h:728
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:764
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:751
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:768
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This is an important base class in LLVM.
Definition Constant.h:43
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Class to represent fixed width SIMD vectors.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:867
This instruction compares its operands according to the predicate given to the constructor.
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
An instruction for reading from memory.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
This class wraps the llvm.memcpy intrinsic.
The optimization diagnostic interface.
The main scalar evolution driver.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
Estimate the overhead of scalarizing an instruction.
bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) const
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
unsigned getNumberOfRegisters(unsigned ClassID) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) const
unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, const Instruction *I) const
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool shouldExpandReduction(const IntrinsicInst *II) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
bool hasDivRemOp(Type *DataType, bool IsSigned) const override
unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
unsigned adjustInliningThreshold(const CallBase *CB) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
unsigned getMaxInterleaveFactor(ElementCount VF, bool HasUnorderedReductions) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
virtual bool isLoweredToCall(const Function *F) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:263
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
Value * getOperand(unsigned i) const
Definition User.h:207
unsigned getNumOperands() const
Definition User.h:229
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
iterator_range< user_iterator > users()
Definition Value.h:426
Base class of all SIMD vector types.
const ParentTy * getParent() const
Definition ilist_node.h:34
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
const unsigned VectorBits
Definition SystemZ.h:155
This is an optimization pass for GlobalISel generic memory operations.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
DWARFExpression::Operation Op
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
unsigned Insns
TODO: Some of these could be merged.
Parameters that control the generic loop unrolling transformation.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned FullUnrollMaxCount
Set the maximum unrolling factor for full unrolling.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number of loop iterations is not known at compile time).
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminate small constant-trip-count loops).
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for runtime unrolling.