//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost",
    cl::desc("add masking cost for i1 vectors"), cl::init(true), cl::Hidden);

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
    cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

static cl::opt<bool>
EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
                cl::desc("Enable using coldcc calling conv for cold "
                         "internal functions"));

static cl::opt<bool>
LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
               cl::desc("Do not add instruction count to lsr cost model"));

// The latency of mtctr is only justified if there are more than 4
// comparisons that will be removed as a result.
static cl::opt<unsigned>
SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
                      cl::desc("Loops with a constant trip count smaller than "
                               "this value will not use the count register."));

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//

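// Report what kind of population-count support is available for an integer of
// the given width.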
TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
    return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
             TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}

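// Fold PPC vector intrinsics in instcombine: known-aligned lvx/stvx become
// plain loads/stores, VSX loads/stores become unaligned loads/stores, and
// vperm with a constant mask is rewritten as explicit extract/insert element
// operations.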
std::optional<Instruction *>
PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
    // Turn PPC lvx -> load if the pointer is known aligned.
    if (getOrEnforceKnownAlignment(
            II.getArgOperand(0), Align(16), IC.getDataLayout(), &II,
            &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
      Value *Ptr = IC.Builder.CreateBitCast(
          II.getArgOperand(0), PointerType::getUnqual(II.getType()));
      return new LoadInst(II.getType(), Ptr, "", false, Align(16));
    }
    break;
  case Intrinsic::ppc_vsx_lxvw4x:
  case Intrinsic::ppc_vsx_lxvd2x: {
    // Turn PPC VSX loads into normal loads.
    Value *Ptr = IC.Builder.CreateBitCast(
        II.getArgOperand(0), PointerType::getUnqual(II.getType()));
    return new LoadInst(II.getType(), Ptr, Twine(""), false, Align(1));
  }
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
    // Turn stvx -> store if the pointer is known aligned.
    if (getOrEnforceKnownAlignment(
            II.getArgOperand(1), Align(16), IC.getDataLayout(), &II,
            &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
      Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType());
      Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy);
      return new StoreInst(II.getArgOperand(0), Ptr, false, Align(16));
    }
    break;
  case Intrinsic::ppc_vsx_stxvw4x:
  case Intrinsic::ppc_vsx_stxvd2x: {
    // Turn PPC VSX stores into normal stores.
    Type *OpPtrTy = PointerType::getUnqual(II.getArgOperand(0)->getType());
    Value *Ptr = IC.Builder.CreateBitCast(II.getArgOperand(1), OpPtrTy);
    return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1));
  }
  case Intrinsic::ppc_altivec_vperm:
    // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
    // Note that ppc_altivec_vperm has a big-endian bias, so when creating
    // a vectorshuffle for little endian, we must undo the transformation
    // performed on vec_perm in altivec.h. That is, we must complement
    // the permutation mask with respect to 31 and reverse the order of
    // V1 and V2.
    if (Constant *Mask = dyn_cast<Constant>(II.getArgOperand(2))) {
      assert(cast<FixedVectorType>(Mask->getType())->getNumElements() == 16 &&
             "Bad type for intrinsic!");

      // Check that all of the elements are integer constants or undefs.
      bool AllEltsOk = true;
      for (unsigned i = 0; i != 16; ++i) {
        Constant *Elt = Mask->getAggregateElement(i);
        if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
          AllEltsOk = false;
          break;
        }
      }

      if (AllEltsOk) {
        // Cast the input vectors to byte vectors.
        Value *Op0 =
            IC.Builder.CreateBitCast(II.getArgOperand(0), Mask->getType());
        Value *Op1 =
            IC.Builder.CreateBitCast(II.getArgOperand(1), Mask->getType());
        Value *Result = UndefValue::get(Op0->getType());

        // Only extract each element once.
        Value *ExtractedElts[32];
        memset(ExtractedElts, 0, sizeof(ExtractedElts));

        for (unsigned i = 0; i != 16; ++i) {
          if (isa<UndefValue>(Mask->getAggregateElement(i)))
            continue;
          unsigned Idx =
              cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
          Idx &= 31; // Match the hardware behavior.
          if (DL.isLittleEndian())
            Idx = 31 - Idx;

          if (!ExtractedElts[Idx]) {
            Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
            Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
            ExtractedElts[Idx] = IC.Builder.CreateExtractElement(
                Idx < 16 ? Op0ToUse : Op1ToUse, IC.Builder.getInt32(Idx & 15));
          }

          // Insert this value into the result vector.
          Result = IC.Builder.CreateInsertElement(Result, ExtractedElts[Idx],
                                                  IC.Builder.getInt32(i));
        }
        return CastInst::Create(Instruction::BitCast, Result, II.getType());
      }
    }
    break;
  }
  return std::nullopt;
}

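// Cost of materializing an integer immediate: zero is free, a 16-bit signed
// value takes one instruction, a 32-bit value takes one or two, and anything
// wider is assumed to take four.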
InstructionCost PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty, CostKind);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}

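// Cost of an integer immediate used as an intrinsic-call operand: constants
// that fold directly into stackmap/patchpoint or the add/sub-with-overflow
// intrinsics are free; everything else falls back to the generic immediate
// cost.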
InstructionCost PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

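// Cost of an integer immediate used as an operand of a regular instruction.
// Immediates that fold into the instruction encoding (16-bit signed values,
// shifted 16-bit values, unsigned 16-bit compare operands, zero for
// selects/compares, and shifted-mask constants for 'and') are free; anything
// else pays the generic materialization cost.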
InstructionCost PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind, Inst);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    [[fallthrough]];
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Zero comparisons can use record-form instructions.
    [[fallthrough]];
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

// Check if the current Type is an MMA vector type. Valid MMA types are
// v256i1 and v512i1.
static bool isMMAType(Type *Ty) {
  return Ty->isVectorTy() && (Ty->getScalarSizeInBits() == 1) &&
         (Ty->getPrimitiveSizeInBits() > 128);
}

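// Generic fallback cost for an arbitrary instruction: vector-typed users are
// scaled by the number of registers the type legalizes to, so operations that
// must be split cost proportionally more.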
InstructionCost PPCTTIImpl::getInstructionCost(const User *U,
                                               ArrayRef<const Value *> Operands,
                                               TTI::TargetCostKind CostKind) {
  // We already implement getCastInstrCost and getMemoryOpCost where we perform
  // the vector adjustment there.
  if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U))
    return BaseT::getInstructionCost(U, Operands, CostKind);

  if (U->getType()->isVectorTy()) {
    // Instructions that need to be split should cost more.
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(U->getType());
    return LT.first * BaseT::getInstructionCost(U, Operands, CostKind);
  }

  return BaseT::getInstructionCost(U, Operands, CostKind);
}

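// Decide whether this loop should become a CTR-based hardware loop: reject
// short constant-trip-count loops, loops that already contain hardware-loop
// intrinsics, and loops whose exit edges are profiled as frequently taken.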
bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                          AssumptionCache &AC,
                                          TargetLibraryInfo *LibInfo,
                                          HardwareLoopInfo &HWLoopInfo) {
  const PPCTargetMachine &TM = ST->getTargetMachine();
  TargetSchedModel SchedModel;
  SchedModel.init(ST);

  // Do not convert small short loops to CTR loop.
  unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
    SmallPtrSet<const Value *, 32> EphValues;
    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
    CodeMetrics Metrics;
    for (BasicBlock *BB : L->blocks())
      Metrics.analyzeBasicBlock(BB, *this, EphValues);
    // 6 is an approximate latency for the mtctr instruction.
    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
      return false;
  }

  // Check that there are no hardware-loop-related intrinsics in the loop.
  for (auto *BB : L->getBlocks())
    for (auto &I : *BB)
      if (auto *Call = dyn_cast<IntrinsicInst>(&I))
        if (Call->getIntrinsicID() == Intrinsic::set_loop_iterations ||
            Call->getIntrinsicID() == Intrinsic::loop_decrement)
          return false;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  // If there is an exit edge known to be frequently taken,
  // we should not transform this loop.
  for (auto &BB : ExitingBlocks) {
    Instruction *TI = BB->getTerminator();
    if (!TI) continue;

    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      uint64_t TrueWeight = 0, FalseWeight = 0;
      if (!BI->isConditional() ||
          !extractBranchWeights(*BI, TrueWeight, FalseWeight))
        continue;

      // If the exit path is more frequent than the loop path,
      // we return here without further analysis for this loop.
      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
      if (( TrueIsExit && FalseWeight < TrueWeight) ||
          (!TrueIsExit && FalseWeight > TrueWeight))
        return false;
    }
  }

  LLVMContext &C = L->getHeader()->getContext();
  HWLoopInfo.CountType = TM.isPPC64() ?
    Type::getInt64Ty(C) : Type::getInt32Ty(C);
  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  return true;
}

void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  if (ST->getCPUDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, SE, UP, ORE);
}

void PPCTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

// This function returns true to allow using coldcc calling convention.
// Returning true results in coldcc being used for functions which are cold at
// all call sites when the callers of the functions are not calling any other
// non-coldcc functions.
bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
  return EnablePPCColdCC;
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always unroll aggressively.
  if (ST->getCPUDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

TTI::MemCmpExpansionOptions
PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;
  Options.LoadSizes = {8, 4, 2, 1};
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  return Options;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}

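// Number of allocatable registers in the given register class: 64 when VSX is
// available for the vector-scalar class, otherwise 32 for each class.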
unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  assert(ClassID == GPRRC || ClassID == FPRRC ||
         ClassID == VRRC || ClassID == VSXRC);
  if (ST->hasVSX()) {
    assert(ClassID == GPRRC || ClassID == VSXRC || ClassID == VRRC);
    return ClassID == VSXRC ? 64 : 32;
  }
  assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC);
  return 32;
}

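// Map an IR type to the register class used for it: vectors go to VR/VSX,
// f32/f64 to FPR or VSX, fp128/ppc_fp128 to VR, f16 to VSX, and everything
// else to the GPRs.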
unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
  if (Vector)
    return ST->hasVSX() ? VSXRC : VRRC;
  else if (Ty && (Ty->getScalarType()->isFloatTy() ||
                  Ty->getScalarType()->isDoubleTy()))
    return ST->hasVSX() ? VSXRC : FPRRC;
  else if (Ty && (Ty->getScalarType()->isFP128Ty() ||
                  Ty->getScalarType()->isPPC_FP128Ty()))
    return VRRC;
  else if (Ty && Ty->getScalarType()->isHalfTy())
    return VSXRC;
  else
    return GPRRC;
}

const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {

  switch (ClassID) {
  default:
    llvm_unreachable("unknown register class");
    return "PPC::unknown register class";
  case GPRRC:  return "PPC::GPRRC";
  case FPRRC:  return "PPC::FPRRC";
  case VRRC:   return "PPC::VRRC";
  case VSXRC:  return "PPC::VSXRC";
  }
}

TypeSize
PPCTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->isPPC64() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasAltivec() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned PPCTTIImpl::getCacheLineSize() const {
  // Starting with P7 we have a cache line size of 128.
  unsigned Directive = ST->getCPUDirective();
  // Assume that Future CPU has the same cache line size as the others.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
      Directive == PPC::DIR_PWR_FUTURE)
    return 128;

  // On other processors return a default of 64 bytes.
  return 64;
}

unsigned PPCTTIImpl::getPrefetchDistance() const {
  return 300;
}

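// Preferred interleave (unroll) factor for latency hiding, chosen per CPU
// directive from floating-point latency and the number of execution units.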
unsigned PPCTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  unsigned Directive = ST->getCPUDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
  // FIXME: the same for P10 as previous gen until POWER10 scheduling is ready
  // Assume that future is the same as the others.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
      Directive == PPC::DIR_PWR_FUTURE)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

// Returns a cost adjustment factor to adjust the cost of vector instructions
// on targets where there is overlap between the vector and scalar units,
// thereby reducing the overall throughput of vector code wrt. scalar code.
// An invalid instruction cost is returned if the type is an MMA vector type.
InstructionCost PPCTTIImpl::vectorCostAdjustmentFactor(unsigned Opcode,
                                                       Type *Ty1, Type *Ty2) {
  // If the vector type is of an MMA type (v256i1, v512i1), an invalid
  // instruction cost is returned. This is to signify to other cost computing
  // functions to return the maximum instruction cost in order to prevent any
  // opportunities for the optimizer to produce MMA types within the IR.
  if (isMMAType(Ty1))
    return InstructionCost::getInvalid();

  if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
    return InstructionCost(1);

  std::pair<InstructionCost, MVT> LT1 = getTypeLegalizationCost(Ty1);
  // If type legalization involves splitting the vector, we don't want to
  // double the cost at every step - only the last step.
  if (LT1.first != 1 || !LT1.second.isVector())
    return InstructionCost(1);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  if (TLI->isOperationExpand(ISD, LT1.second))
    return InstructionCost(1);

  if (Ty2) {
    std::pair<InstructionCost, MVT> LT2 = getTypeLegalizationCost(Ty2);
    if (LT2.first != 1 || !LT2.second.isVector())
      return InstructionCost(1);
  }

  return InstructionCost(2);
}

InstructionCost PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Ty, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

  // Fallback to the default implementation.
  InstructionCost Cost = BaseT::getArithmeticInstrCost(
      Opcode, Ty, CostKind, Op1Info, Op2Info);
  return Cost * CostFactor;
}

InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
                                           ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, Type *SubTp,
                                           ArrayRef<const Value *> Args) {

  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Instruction::ShuffleVector, Tp, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // PPC, for both Altivec/VSX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
  return LT.first * CostFactor;
}

InstructionCost PPCTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}

InstructionCost PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Dst, Src);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost =
      BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  Cost *= CostFactor;
  // TODO: Allow non-throughput costs that aren't binary.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost == 0 ? 0 : 1;
  return Cost;
}

InstructionCost PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Opcode, ValTy, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost =
      BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;
  return Cost * CostFactor;
}

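// Cost of inserting an element into, or extracting an element from, a vector
// register. With VSX/direct-move the cost depends on whether a cheap
// move-from/move-to VSR can be used; plain Altivec pays a load-hit-store
// penalty for the stack round trip.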
InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Val, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost =
      BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
  Cost *= CostFactor;

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0 (or #1 if LE).
    if (ISD == ISD::EXTRACT_VECTOR_ELT &&
        Index == (ST->isLittleEndian() ? 1 : 0))
      return 0;

    return Cost;

  } else if (Val->getScalarType()->isIntegerTy() && Index != -1U) {
    unsigned EltSize = Val->getScalarSizeInBits();
    // Computing on 1 bit values requires extra mask or compare operations.
    unsigned MaskCost = VecMaskCost && EltSize == 1 ? 1 : 0;
    if (ST->hasP9Altivec()) {
      if (ISD == ISD::INSERT_VECTOR_ELT)
        // A move-to VSR and a permute/insert. Assume vector operation cost
        // for both (cost will be 2x on P9).
        return 2 * CostFactor;

      // It's an extract. Maybe we can do a cheap move-from VSR.
      unsigned EltSize = Val->getScalarSizeInBits();
      if (EltSize == 64) {
        unsigned MfvsrdIndex = ST->isLittleEndian() ? 1 : 0;
        if (Index == MfvsrdIndex)
          return 1;
      } else if (EltSize == 32) {
        unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1;
        if (Index == MfvsrwzIndex)
          return 1;
      }

      // We need a vector extract (or mfvsrld). Assume vector operation cost.
      // The cost of the load constant for a vector extract is disregarded
      // (invariant, easily schedulable).
      return CostFactor + MaskCost;

    } else if (ST->hasDirectMove()) {
      // Assume permute has standard cost.
      // Assume move-to/move-from VSR have 2x standard cost.
      if (ISD == ISD::INSERT_VECTOR_ELT)
        return 3;
      return 3 + MaskCost;
    }
  }

  // Estimated cost of a load-hit-store delay. This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark. It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive,
  // because they require store and reload with the attendant
  // processor stall for load-hit-store. Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + Cost;

  return Cost;
}

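// Cost of a scalar or vector load/store: aligned vector accesses are cheap,
// unaligned Altivec accesses may use the permutation-based load sequence, and
// accesses that must be decomposed are charged per scalar piece plus
// scalarization overhead for vector stores.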
InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) {

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Src, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  InstructionCost Cost =
      BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;

  Cost *= CostFactor;

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);

  // VSX has 32b/64b load instructions. Legalization can handle loading of
  // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
  // PPCTargetLowering can't compute the cost appropriately. So here we
  // explicitly check this case.
  unsigned MemBytes = Src->getPrimitiveSizeInBits();
  if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType &&
      (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
    return 1;

  // Aligned loads and stores are easy.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (!SrcBytes || !Alignment || *Alignment >= SrcBytes)
    return Cost;

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we could do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
  if (Opcode == Instruction::Load && (!ST->hasP8Vector() && IsAltivecType) &&
      *Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // Newer PPC supports unaligned memory access.
  if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
  assert(Alignment);
  Cost += LT.first * ((SrcBytes / Alignment->value()) - 1);

  // For a vector type, there is also scalarization overhead (only for
  // stores; loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = cast<FixedVectorType>(Src)->getNumElements(); i < e;
         ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, CostKind, i,
                                 nullptr, nullptr);

  return Cost;
}

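// Interleaved vector load/store: the cost of the underlying memory operation
// plus the permutes needed to (de)interleave the lanes across the legalized
// registers.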
InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Opcode, VecTy, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);

  // Firstly, the cost of load/store operation.
  InstructionCost Cost = getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment),
                                         AddressSpace, CostKind);

  // PPC, for both Altivec/VSX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
  Cost += Factor * (LT.first - 1);

  return Cost;
}

InstructionCost
PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

bool PPCTTIImpl::areTypesABICompatible(const Function *Caller,
                                       const Function *Callee,
                                       const ArrayRef<Type *> &Types) const {

  // We need to ensure that argument promotion does not
  // attempt to promote pointers to MMA types (__vector_pair
  // and __vector_quad) since these types explicitly cannot be
  // passed as arguments. Both of these types are larger than
  // the 128-bit Altivec vectors and have a scalar size of 1 bit.
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  return llvm::none_of(Types, [](Type *Ty) {
    if (Ty->isSized())
      return Ty->isIntOrIntVectorTy(1) && Ty->getPrimitiveSizeInBits() > 128;
    return false;
  });
}

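// Return true if this innermost loop will become a CTR hardware loop, in
// which case the comparison feeding its exit branch is redundant; the exit
// branch is reported through BI.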
bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
                            LoopInfo *LI, DominatorTree *DT,
                            AssumptionCache *AC, TargetLibraryInfo *LibInfo) {
  // Process nested loops first.
  for (Loop *I : *L)
    if (canSaveCmp(I, BI, SE, LI, DT, AC, LibInfo))
      return false; // Stop search.

  HardwareLoopInfo HWLoopInfo(L);

  if (!HWLoopInfo.canAnalyze(*LI))
    return false;

  if (!isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo))
    return false;

  if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT))
    return false;

  *BI = HWLoopInfo.ExitBranch;
  return true;
}

bool PPCTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                               const TargetTransformInfo::LSRCost &C2) {
  // PowerPC default behaviour here is "instruction number 1st priority".
  // If LsrNoInsnsCost is set, call default implementation.
  if (!LsrNoInsnsCost)
    return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                    C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
           std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                    C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
  else
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
}

bool PPCTTIImpl::isNumRegsMajorCostOfLSR() {
  return false;
}

bool PPCTTIImpl::shouldBuildRelLookupTables() const {
  const PPCTargetMachine &TM = ST->getTargetMachine();
  // XCOFF hasn't implemented lowerRelativeReference, disable non-ELF for now.
  if (!TM.isELFv2ABI())
    return false;
  return BaseT::shouldBuildRelLookupTables();
}

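// Report which PPC load/store intrinsics read or write memory, and through
// which pointer operand they do so.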
bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
  case Intrinsic::ppc_altivec_lvebx:
  case Intrinsic::ppc_altivec_lvehx:
  case Intrinsic::ppc_altivec_lvewx:
  case Intrinsic::ppc_vsx_lxvd2x:
  case Intrinsic::ppc_vsx_lxvw4x:
  case Intrinsic::ppc_vsx_lxvd2x_be:
  case Intrinsic::ppc_vsx_lxvw4x_be:
  case Intrinsic::ppc_vsx_lxvl:
  case Intrinsic::ppc_vsx_lxvll:
  case Intrinsic::ppc_vsx_lxvp: {
    Info.PtrVal = Inst->getArgOperand(0);
    Info.ReadMem = true;
    Info.WriteMem = false;
    return true;
  }
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
  case Intrinsic::ppc_altivec_stvebx:
  case Intrinsic::ppc_altivec_stvehx:
  case Intrinsic::ppc_altivec_stvewx:
  case Intrinsic::ppc_vsx_stxvd2x:
  case Intrinsic::ppc_vsx_stxvw4x:
  case Intrinsic::ppc_vsx_stxvd2x_be:
  case Intrinsic::ppc_vsx_stxvw4x_be:
  case Intrinsic::ppc_vsx_stxvl:
  case Intrinsic::ppc_vsx_stxvll:
  case Intrinsic::ppc_vsx_stxvp: {
    Info.PtrVal = Inst->getArgOperand(1);
    Info.ReadMem = false;
    Info.WriteMem = true;
    return true;
  }
  case Intrinsic::ppc_stbcx:
  case Intrinsic::ppc_sthcx:
  case Intrinsic::ppc_stdcx:
  case Intrinsic::ppc_stwcx: {
    Info.PtrVal = Inst->getArgOperand(0);
    Info.ReadMem = false;
    Info.WriteMem = true;
    return true;
  }
  default:
    break;
  }

  return false;
}

bool PPCTTIImpl::hasActiveVectorLength(unsigned Opcode, Type *DataType,
                                       Align Alignment) const {
  // Only load and store instructions can have variable vector length on Power.
  if (Opcode != Instruction::Load && Opcode != Instruction::Store)
    return false;
  // Loads/stores with length instructions use bits 0-7 of the GPR operand and
  // therefore cannot be used in 32-bit mode.
  if ((!ST->hasP9Vector() && !ST->hasP10Vector()) || !ST->isPPC64())
    return false;
  if (isa<FixedVectorType>(DataType)) {
    unsigned VecWidth = DataType->getPrimitiveSizeInBits();
    return VecWidth == 128;
  }
  Type *ScalarTy = DataType->getScalarType();

  if (ScalarTy->isPointerTy())
    return true;

  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
    return true;

  if (!ScalarTy->isIntegerTy())
    return false;

  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  return IntWidth == 8 || IntWidth == 16 || IntWidth == 32 || IntWidth == 64;
}

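// Cost of a vector-predicated (VP) load or store. When load/store-with-length
// is usable, the cost matches an unmasked access, with an extra penalty on P9
// when the access may be misaligned; otherwise fall back to the masked
// (scalarized) memory-op cost.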
InstructionCost PPCTTIImpl::getVPMemoryOpCost(unsigned Opcode, Type *Src,
                                              Align Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              const Instruction *I) {
  InstructionCost Cost = BaseT::getVPMemoryOpCost(Opcode, Src, Alignment,
                                                  AddressSpace, CostKind, I);
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return Cost;
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;

  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  auto *SrcVTy = dyn_cast<FixedVectorType>(Src);
  assert(SrcVTy && "Expected a vector type for VP memory operations");

  if (hasActiveVectorLength(Opcode, Src, Alignment)) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);

    InstructionCost CostFactor =
        vectorCostAdjustmentFactor(Opcode, Src, nullptr);
    if (!CostFactor.isValid())
      return InstructionCost::getMax();

    InstructionCost Cost = LT.first * CostFactor;
    assert(Cost.isValid() && "Expected valid cost");

    // On P9 but not on P10, if the op is misaligned then it will cause a
    // pipeline flush. Otherwise the VSX masked memops cost the same as unmasked
    // ones.
    const Align DesiredAlignment(16);
    if (Alignment >= DesiredAlignment || ST->getCPUDirective() != PPC::DIR_PWR9)
      return Cost;

    // Since alignment may be underestimated, we try to compute the probability
    // that the actual address is aligned to the desired boundary. For example
    // an 8-byte aligned load is assumed to be actually 16-byte aligned half the
    // time, while a 4-byte aligned load has a 25% chance of being 16-byte
    // aligned.
    float AlignmentProb = ((float)Alignment.value()) / DesiredAlignment.value();
    float MisalignmentProb = 1.0 - AlignmentProb;
    return (MisalignmentProb * P9PipelineFlushEstimate) +
           (AlignmentProb * *Cost.getValue());
  }

  // Usually we should not get to this point, but the following is an attempt to
  // model the cost of legalization. Currently we can only lower intrinsics with
  // evl but no mask, on Power 9/10. Otherwise, we must scalarize.
  return getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}

bool PPCTTIImpl::supportsTailCallFor(const CallBase *CB) const {
  return TLI->supportsTailCallFor(CB);
}