//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost",
cl::desc("add masking cost for i1 vectors"), cl::init(true), cl::Hidden);

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

static cl::opt<bool>
EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
                cl::desc("Enable using coldcc calling conv for cold "
                         "internal functions"));

static cl::opt<bool>
LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
               cl::desc("Do not add instruction count to lsr cost model"));
// The latency of mtctr is only justified if there are more than 4
// comparisons that will be removed as a result.
static cl::opt<unsigned>
SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
                      cl::desc("Loops with a constant trip count smaller than "
                               "this value will not use the count register."));

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//
TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
    return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
             TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}
std::optional<Instruction *>
PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
    // Turn PPC lvx -> load if the pointer is known aligned.
    if (getOrEnforceKnownAlignment(
            II.getArgOperand(0), Align(16), IC.getDataLayout(), &II,
            &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
      Value *Ptr = II.getArgOperand(0);
      return new LoadInst(II.getType(), Ptr, "", false, Align(16));
    }
    break;
  case Intrinsic::ppc_vsx_lxvw4x:
  case Intrinsic::ppc_vsx_lxvd2x: {
    // Turn PPC VSX loads into normal loads.
    Value *Ptr = II.getArgOperand(0);
    return new LoadInst(II.getType(), Ptr, Twine(""), false, Align(1));
  }
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
    // Turn stvx -> store if the pointer is known aligned.
    if (getOrEnforceKnownAlignment(
            II.getArgOperand(1), Align(16), IC.getDataLayout(), &II,
            &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
      Value *Ptr = II.getArgOperand(1);
      return new StoreInst(II.getArgOperand(0), Ptr, false, Align(16));
    }
    break;
  case Intrinsic::ppc_vsx_stxvw4x:
  case Intrinsic::ppc_vsx_stxvd2x: {
    // Turn PPC VSX stores into normal stores.
    Value *Ptr = II.getArgOperand(1);
    return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1));
  }
  case Intrinsic::ppc_altivec_vperm:
    // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
    // Note that ppc_altivec_vperm has a big-endian bias, so when creating
    // a vectorshuffle for little endian, we must undo the transformation
    // performed on vec_perm in altivec.h. That is, we must complement
    // the permutation mask with respect to 31 and reverse the order of
    // V1 and V2.
    if (Constant *Mask = dyn_cast<Constant>(II.getArgOperand(2))) {
      assert(cast<FixedVectorType>(Mask->getType())->getNumElements() == 16 &&
             "Bad type for intrinsic!");

      // Check that all of the elements are integer constants or undefs.
      bool AllEltsOk = true;
      for (unsigned i = 0; i != 16; ++i) {
        Constant *Elt = Mask->getAggregateElement(i);
        if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
          AllEltsOk = false;
          break;
        }
      }

      if (AllEltsOk) {
        // Cast the input vectors to byte vectors.
        Value *Op0 =
            IC.Builder.CreateBitCast(II.getArgOperand(0), Mask->getType());
        Value *Op1 =
            IC.Builder.CreateBitCast(II.getArgOperand(1), Mask->getType());
        Value *Result = UndefValue::get(Op0->getType());

        // Only extract each element once.
        Value *ExtractedElts[32];
        memset(ExtractedElts, 0, sizeof(ExtractedElts));

        for (unsigned i = 0; i != 16; ++i) {
          if (isa<UndefValue>(Mask->getAggregateElement(i)))
            continue;
          unsigned Idx =
              cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
          Idx &= 31; // Match the hardware behavior.
          if (DL.isLittleEndian())
            Idx = 31 - Idx;

          if (!ExtractedElts[Idx]) {
            Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
            Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
            ExtractedElts[Idx] = IC.Builder.CreateExtractElement(
                Idx < 16 ? Op0ToUse : Op1ToUse, IC.Builder.getInt32(Idx & 15));
          }

          // Insert this value into the result vector.
          Result = IC.Builder.CreateInsertElement(Result, ExtractedElts[Idx],
                                                  IC.Builder.getInt32(i));
        }
        return CastInst::Create(Instruction::BitCast, Result, II.getType());
      }
    }
    break;
  }
  return std::nullopt;
}
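// Illustrative note on the little-endian handling in the vperm case above:
// a mask byte of 0 is complemented to Idx = 31 - 0 = 31, which selects byte
// Idx & 15 = 15 from the swapped operand (Op0), undoing the complement-and-
// swap that altivec.h's vec_perm applied when the intrinsic was formed.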
InstructionCost PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty, CostKind);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}
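// Worked example of the immediate costs above (derived directly from the
// checks in getIntImmCost): 0 is free; 0x7FFF fits a signed 16-bit field and
// costs one instruction (li); 0x12340000 has a zero low halfword and also
// costs one (lis); 0x12345678 needs lis+ori (2 * TCC_Basic); anything that
// does not fit in a signed 32-bit immediate is charged 4 * TCC_Basic.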
InstructionCost PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint_i64:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
InstructionCost PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind, Inst);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    [[fallthrough]];
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Zero comparisons can use record-form instructions.
    [[fallthrough]];
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
// Check if the current Type is an MMA vector type. Valid MMA types are
// v256i1 and v512i1 respectively.
static bool isMMAType(Type *Ty) {
  return Ty->isVectorTy() && (Ty->getScalarSizeInBits() == 1) &&
         (Ty->getPrimitiveSizeInBits() > 128);
}

InstructionCost PPCTTIImpl::getInstructionCost(const User *U,
                                               ArrayRef<const Value *> Operands,
                                               TTI::TargetCostKind CostKind) {
  // We already implement getCastInstrCost and getMemoryOpCost where we perform
  // the vector adjustment there.
  if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U))
    return BaseT::getInstructionCost(U, Operands, CostKind);

  if (U->getType()->isVectorTy()) {
    // Instructions that need to be split should cost more.
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(U->getType());
    return LT.first * BaseT::getInstructionCost(U, Operands, CostKind);
  }

  return BaseT::getInstructionCost(U, Operands, CostKind);
}
bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                          AssumptionCache &AC,
                                          TargetLibraryInfo *LibInfo,
                                          HardwareLoopInfo &HWLoopInfo) {
  const PPCTargetMachine &TM = ST->getTargetMachine();
  TargetSchedModel SchedModel;
  SchedModel.init(ST);

  // Do not convert small short loops to CTR loop.
  unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
    SmallPtrSet<const Value *, 32> EphValues;
    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
    CodeMetrics Metrics;
    for (BasicBlock *BB : L->blocks())
      Metrics.analyzeBasicBlock(BB, *this, EphValues);
    // 6 is an approximate latency for the mtctr instruction.
    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
      return false;
  }

  // Check that there are no hardware-loop-related intrinsics in the loop.
  for (auto *BB : L->getBlocks())
    for (auto &I : *BB)
      if (auto *Call = dyn_cast<IntrinsicInst>(&I))
        if (Call->getIntrinsicID() == Intrinsic::set_loop_iterations ||
            Call->getIntrinsicID() == Intrinsic::loop_decrement)
          return false;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  // If there is an exit edge known to be frequently taken,
  // we should not transform this loop.
  for (auto &BB : ExitingBlocks) {
    Instruction *TI = BB->getTerminator();
    if (!TI) continue;

    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      uint64_t TrueWeight = 0, FalseWeight = 0;
      if (!BI->isConditional() ||
          !extractBranchWeights(*BI, TrueWeight, FalseWeight))
        continue;

      // If the exit path is more frequent than the loop path,
      // we return here without further analysis for this loop.
      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
      if ((TrueIsExit && FalseWeight < TrueWeight) ||
          (!TrueIsExit && FalseWeight > TrueWeight))
        return false;
    }
  }

  LLVMContext &C = L->getHeader()->getContext();
  HWLoopInfo.CountType = TM.isPPC64() ?
    Type::getInt64Ty(C) : Type::getInt32Ty(C);
  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  return true;
}
void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  if (ST->getCPUDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, SE, UP, ORE);
}

void PPCTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

// This function returns true to allow using the coldcc calling convention.
// Returning true results in coldcc being used for functions which are cold at
// all call sites when the callers of the functions are not calling any other
// non-coldcc functions.
bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
  return EnablePPCColdCC;
}
bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always unroll aggressively.
  if (ST->getCPUDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}

TTI::MemCmpExpansionOptions
PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;
  Options.LoadSizes = {8, 4, 2, 1};
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  return Options;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}
unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  assert(ClassID == GPRRC || ClassID == FPRRC ||
         ClassID == VRRC || ClassID == VSXRC);
  if (ST->hasVSX()) {
    assert(ClassID == GPRRC || ClassID == VSXRC || ClassID == VRRC);
    return ClassID == VSXRC ? 64 : 32;
  }
  assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC);
  return 32;
}
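// Note: with VSX the register file vs0-vs63 overlays the 32 FPRs and the
// 32 Altivec VRs, which is why a VSX subtarget reports 64 vector-capable
// registers above while every other class reports 32.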
unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
  if (Vector)
    return ST->hasVSX() ? VSXRC : VRRC;
  else if (Ty && (Ty->getScalarType()->isFloatTy() ||
                  Ty->getScalarType()->isDoubleTy()))
    return ST->hasVSX() ? VSXRC : FPRRC;
  else if (Ty && (Ty->getScalarType()->isFP128Ty() ||
                  Ty->getScalarType()->isPPC_FP128Ty()))
    return VRRC;
  else if (Ty && Ty->getScalarType()->isHalfTy())
    return VSXRC;
  else
    return GPRRC;
}

const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {

  switch (ClassID) {
  default:
    llvm_unreachable("unknown register class");
    return "PPC::unknown register class";
  case GPRRC: return "PPC::GPRRC";
  case FPRRC: return "PPC::FPRRC";
  case VRRC:  return "PPC::VRRC";
  case VSXRC: return "PPC::VSXRC";
  }
}
TypeSize
PPCTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->isPPC64() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasAltivec() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned PPCTTIImpl::getCacheLineSize() const {
  // Starting with P7 we have a cache line size of 128.
  unsigned Directive = ST->getCPUDirective();
  // Assume that Future CPU has the same cache line size as the others.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
      Directive == PPC::DIR_PWR_FUTURE)
    return 128;

  // On other processors return a default of 64 bytes.
  return 64;
}

unsigned PPCTTIImpl::getPrefetchDistance() const {
  return 300;
}
unsigned PPCTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  unsigned Directive = ST->getCPUDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
  // FIXME: the same for P10 as previous gen until POWER10 scheduling is ready
  // Assume that future is the same as the others.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
      Directive == PPC::DIR_PWR_FUTURE)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}
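// Note: the interleave factors returned above roughly follow (FP latency) x
// (number of FP pipelines): 5 cycles x 1 unit on the 440, 6 x 1 on the A2,
// and 6 x 2 = 12 on P7 and later, matching the rationale in the comments of
// getMaxInterleaveFactor.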
// Returns a cost adjustment factor to adjust the cost of vector instructions
// on targets where there is overlap between the vector and scalar units,
// thereby reducing the overall throughput of vector code wrt. scalar code.
// An invalid instruction cost is returned if the type is an MMA vector type.
InstructionCost PPCTTIImpl::vectorCostAdjustmentFactor(unsigned Opcode,
                                                       Type *Ty1, Type *Ty2) {
  // If the vector type is of an MMA type (v256i1, v512i1), an invalid
  // instruction cost is returned. This is to signify to other cost computing
  // functions to return the maximum instruction cost in order to prevent any
  // opportunities for the optimizer to produce MMA types within the IR.
  if (isMMAType(Ty1))
    return InstructionCost::getInvalid();

  if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
    return InstructionCost(1);

  std::pair<InstructionCost, MVT> LT1 = getTypeLegalizationCost(Ty1);
  // If type legalization involves splitting the vector, we don't want to
  // double the cost at every step - only the last step.
  if (LT1.first != 1 || !LT1.second.isVector())
    return InstructionCost(1);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  if (TLI->isOperationExpand(ISD, LT1.second))
    return InstructionCost(1);

  if (Ty2) {
    std::pair<InstructionCost, MVT> LT2 = getTypeLegalizationCost(Ty2);
    if (LT2.first != 1 || !LT2.second.isVector())
      return InstructionCost(1);
  }

  return InstructionCost(2);
}
InstructionCost PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Ty, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

  // Fallback to the default implementation.
  InstructionCost Cost = BaseT::getArithmeticInstrCost(
      Opcode, Ty, CostKind, Op1Info, Op2Info);
  return Cost * CostFactor;
}
InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
                                           ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, Type *SubTp,
                                           ArrayRef<const Value *> Args) {

  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Instruction::ShuffleVector, Tp, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // PPC, for both Altivec/VSX, support cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
  return LT.first * CostFactor;
}

InstructionCost PPCTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}
InstructionCost PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Dst, Src);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost =
      BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  Cost *= CostFactor;
  // TODO: Allow non-throughput costs that aren't binary.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost == 0 ? 0 : 1;
  return Cost;
}

InstructionCost PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                               Type *CondTy,
                                               CmpInst::Predicate VecPred,
                                               TTI::TargetCostKind CostKind,
                                               const Instruction *I) {
  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Opcode, ValTy, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost =
      BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;
  return Cost * CostFactor;
}
InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Val, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost =
      BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
  Cost *= CostFactor;

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0 (or #1 if LE).
    if (ISD == ISD::EXTRACT_VECTOR_ELT &&
        Index == (ST->isLittleEndian() ? 1 : 0))
      return 0;

    return Cost;

  } else if (Val->getScalarType()->isIntegerTy()) {
    unsigned EltSize = Val->getScalarSizeInBits();
    // Computing on 1-bit values requires extra mask or compare operations.
    unsigned MaskCostForOneBitSize = (VecMaskCost && EltSize == 1) ? 1 : 0;
    // Computing on a non-constant index requires extra mask or compare
    // operations.
    unsigned MaskCostForIdx = (Index != -1U) ? 0 : 1;
    if (ST->hasP9Altivec()) {
      // P10 has a vxform insert which can handle a non-constant index. The
      // MaskCostForIdx is for masking the index.
      // P9 has an insert for a constant index: a move-to VSR and a
      // permute/insert. Assume vector operation cost for both (cost will be
      // 2x on P9).
      if (ISD == ISD::INSERT_VECTOR_ELT) {
        if (ST->hasP10Vector())
          return CostFactor + MaskCostForIdx;
        else if (Index != -1U)
          return 2 * CostFactor;
      } else if (ISD == ISD::EXTRACT_VECTOR_ELT) {
        // It's an extract. Maybe we can do a cheap move-from VSR.
        unsigned EltSize = Val->getScalarSizeInBits();
        if (EltSize == 64) {
          // FIXME: no need to worry about endian, P9 has both mfvsrd/mfvsrld.
          unsigned MfvsrdIndex = ST->isLittleEndian() ? 1 : 0;
          if (Index == MfvsrdIndex)
            return 1;
        } else if (EltSize == 32) {
          unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1;
          if (Index == MfvsrwzIndex)
            return 1;

          // For other indices, such as a non-constant one, P9 has a vxform
          // extract. The MaskCostForIdx is for masking the index.
          return CostFactor + MaskCostForIdx;
        }

        // We need a vector extract (or mfvsrld). Assume vector operation cost.
        // The cost of the load constant for a vector extract is disregarded
        // (invariant, easily schedulable).
        return CostFactor + MaskCostForOneBitSize + MaskCostForIdx;
      }
    } else if (ST->hasDirectMove() && Index != -1U) {
      // Assume permute has standard cost.
      // Assume move-to/move-from VSR have 2x standard cost.
      if (ISD == ISD::INSERT_VECTOR_ELT)
        return 3;
      return 3 + MaskCostForOneBitSize;
    }
  }

  // Estimated cost of a load-hit-store delay. This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark. It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive,
  // because they require store and reload with the attendant
  // processor stall for load-hit-store. Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + Cost;

  return Cost;
}
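// Example of the P9 extract costs above: extracting element 1 of a <2 x i64>
// on a little-endian P9 is a single mfvsrd and is costed at 1, while an
// extract with a non-constant index falls through to the generic vector-op
// path and is costed at CostFactor plus 1 for masking the index.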
InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) {

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Src, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  InstructionCost Cost =
      BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;

  Cost *= CostFactor;

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);

  // VSX has 32b/64b load instructions. Legalization can handle loading of
  // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
  // PPCTargetLowering can't compute the cost appropriately. So here we
  // explicitly check this case. There are also corresponding store
  // instructions.
  unsigned MemBytes = Src->getPrimitiveSizeInBits();
  if (ST->hasVSX() && IsAltivecType &&
      (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
    return 1;

  // Aligned loads and stores are easy.
  unsigned SrcBytes = LT.second.getStoreSize();
  if (!SrcBytes || !Alignment || *Alignment >= SrcBytes)
    return Cost;

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we could do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
  if (Opcode == Instruction::Load && (!ST->hasP8Vector() && IsAltivecType) &&
      *Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // Newer PPC supports unaligned memory access.
  if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
  assert(Alignment);
  Cost += LT.first * ((SrcBytes / Alignment->value()) - 1);

  // For a vector type, there is also scalarization overhead (only for
  // stores, loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = cast<FixedVectorType>(Src)->getNumElements(); i < e;
         ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, CostKind, i,
                                 nullptr, nullptr);

  return Cost;
}
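// Worked example of the misaligned path above: storing a <4 x i32> (16 bytes)
// with only 4-byte known alignment on a pre-VSX Altivec subtarget that does
// not allow misaligned accesses adds LT.first * (16/4 - 1) = 3 to the base
// cost, plus one extract-element cost per lane from the scalarization loop.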
InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Opcode, VecTy, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);

  // Firstly, the cost of load/store operation.
  InstructionCost Cost = getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment),
                                         AddressSpace, CostKind);

  // PPC, for both Altivec/VSX, support cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
  Cost += Factor * (LT.first - 1);

  return Cost;
}

InstructionCost
PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
bool PPCTTIImpl::areTypesABICompatible(const Function *Caller,
                                       const Function *Callee,
                                       const ArrayRef<Type *> &Types) const {

  // We need to ensure that argument promotion does not
  // attempt to promote pointers to MMA types (__vector_pair
  // and __vector_quad) since these types explicitly cannot be
  // passed as arguments. Both of these types are larger than
  // the 128-bit Altivec vectors and have a scalar size of 1 bit.
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  return llvm::none_of(Types, [](Type *Ty) {
    if (Ty->isSized())
      return Ty->isIntOrIntVectorTy(1) && Ty->getPrimitiveSizeInBits() > 128;
    return false;
  });
}

bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
                            LoopInfo *LI, DominatorTree *DT,
                            AssumptionCache *AC, TargetLibraryInfo *LibInfo) {
  // Process nested loops first.
  for (Loop *I : *L)
    if (canSaveCmp(I, BI, SE, LI, DT, AC, LibInfo))
      return false; // Stop search.

  HardwareLoopInfo HWLoopInfo(L);

  if (!HWLoopInfo.canAnalyze(*LI))
    return false;

  if (!isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo))
    return false;

  if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT))
    return false;

  *BI = HWLoopInfo.ExitBranch;
  return true;
}
bool PPCTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                               const TargetTransformInfo::LSRCost &C2) {
  // PowerPC default behaviour here is "instruction number 1st priority".
  // If LsrNoInsnsCost is set, call the default implementation.
  if (!LsrNoInsnsCost)
    return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                    C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
           std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                    C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
  else
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
}

bool PPCTTIImpl::isNumRegsMajorCostOfLSR() {
  return false;
}

bool PPCTTIImpl::shouldBuildRelLookupTables() const {
  const PPCTargetMachine &TM = ST->getTargetMachine();
  // XCOFF hasn't implemented lowerRelativeReference, disable non-ELF for now.
  if (!TM.isELFv2ABI())
    return false;
  return BaseT::shouldBuildRelLookupTables();
}
bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
  case Intrinsic::ppc_altivec_lvebx:
  case Intrinsic::ppc_altivec_lvehx:
  case Intrinsic::ppc_altivec_lvewx:
  case Intrinsic::ppc_vsx_lxvd2x:
  case Intrinsic::ppc_vsx_lxvw4x:
  case Intrinsic::ppc_vsx_lxvd2x_be:
  case Intrinsic::ppc_vsx_lxvw4x_be:
  case Intrinsic::ppc_vsx_lxvl:
  case Intrinsic::ppc_vsx_lxvll:
  case Intrinsic::ppc_vsx_lxvp: {
    Info.PtrVal = Inst->getArgOperand(0);
    Info.ReadMem = true;
    Info.WriteMem = false;
    return true;
  }
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
  case Intrinsic::ppc_altivec_stvebx:
  case Intrinsic::ppc_altivec_stvehx:
  case Intrinsic::ppc_altivec_stvewx:
  case Intrinsic::ppc_vsx_stxvd2x:
  case Intrinsic::ppc_vsx_stxvw4x:
  case Intrinsic::ppc_vsx_stxvd2x_be:
  case Intrinsic::ppc_vsx_stxvw4x_be:
  case Intrinsic::ppc_vsx_stxvl:
  case Intrinsic::ppc_vsx_stxvll:
  case Intrinsic::ppc_vsx_stxvp: {
    Info.PtrVal = Inst->getArgOperand(1);
    Info.ReadMem = false;
    Info.WriteMem = true;
    return true;
  }
  case Intrinsic::ppc_stbcx:
  case Intrinsic::ppc_sthcx:
  case Intrinsic::ppc_stdcx:
  case Intrinsic::ppc_stwcx: {
    Info.PtrVal = Inst->getArgOperand(0);
    Info.ReadMem = false;
    Info.WriteMem = true;
    return true;
  }
  default:
    break;
  }

  return false;
}
bool PPCTTIImpl::hasActiveVectorLength(unsigned Opcode, Type *DataType,
                                       Align Alignment) const {
  // Only load and store instructions can have a variable vector length on
  // Power.
  if (Opcode != Instruction::Load && Opcode != Instruction::Store)
    return false;
  // Loads/stores with length instructions use bits 0-7 of the GPR operand and
  // therefore cannot be used in 32-bit mode.
  if ((!ST->hasP9Vector() && !ST->hasP10Vector()) || !ST->isPPC64())
    return false;
  if (isa<FixedVectorType>(DataType)) {
    unsigned VecWidth = DataType->getPrimitiveSizeInBits();
    return VecWidth == 128;
  }
  Type *ScalarTy = DataType->getScalarType();

  if (ScalarTy->isPointerTy())
    return true;

  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
    return true;

  if (!ScalarTy->isIntegerTy())
    return false;

  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  return IntWidth == 8 || IntWidth == 16 || IntWidth == 32 || IntWidth == 64;
}
InstructionCost PPCTTIImpl::getVPMemoryOpCost(unsigned Opcode, Type *Src,
                                              Align Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              const Instruction *I) {
  InstructionCost Cost = BaseT::getVPMemoryOpCost(Opcode, Src, Alignment,
                                                  AddressSpace, CostKind, I);
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return Cost;
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;

  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  auto *SrcVTy = dyn_cast<FixedVectorType>(Src);
  assert(SrcVTy && "Expected a vector type for VP memory operations");

  if (hasActiveVectorLength(Opcode, Src, Alignment)) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);

    InstructionCost CostFactor =
        vectorCostAdjustmentFactor(Opcode, Src, nullptr);
    if (!CostFactor.isValid())
      return InstructionCost::getMax();

    InstructionCost Cost = LT.first * CostFactor;
    assert(Cost.isValid() && "Expected valid cost");

    // On P9 but not on P10, if the op is misaligned then it will cause a
    // pipeline flush. Otherwise the VSX masked memops cost the same as
    // unmasked ones.
    const Align DesiredAlignment(16);
    if (Alignment >= DesiredAlignment || ST->getCPUDirective() != PPC::DIR_PWR9)
      return Cost;

    // Since alignment may be under estimated, we try to compute the
    // probability that the actual address is aligned to the desired boundary.
    // For example an 8-byte aligned load is assumed to be actually 16-byte
    // aligned half the time, while a 4-byte aligned load has a 25% chance of
    // being 16-byte aligned.
    float AlignmentProb = ((float)Alignment.value()) / DesiredAlignment.value();
    float MisalignmentProb = 1.0 - AlignmentProb;
    return (MisalignmentProb * P9PipelineFlushEstimate) +
           (AlignmentProb * *Cost.getValue());
  }

  // Usually we should not get to this point, but the following is an attempt
  // to model the cost of legalization. Currently we can only lower intrinsics
  // with evl but no mask, on Power 9/10. Otherwise, we must scalarize.
  return getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}
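// Example of the P9 alignment model above: an 8-byte-aligned VP load is
// assumed to be 16-byte aligned half of the time, so its cost becomes
// 0.5 * P9PipelineFlushEstimate + 0.5 * (LT.first * CostFactor).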
bool PPCTTIImpl::supportsTailCallFor(const CallBase *CB) const {
  return TLI->supportsTailCallFor(CB);
}