//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "ppctti"

static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost",
cl::desc("add masking cost for i1 vectors"), cl::init(true), cl::Hidden);

static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

static cl::opt<bool>
EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
                cl::desc("Enable using coldcc calling conv for cold "
                         "internal functions"));

static cl::opt<bool>
LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
               cl::desc("Do not add instruction count to lsr cost model"));

// The latency of mtctr is only justified if there are more than 4
// comparisons that will be removed as a result.
static cl::opt<unsigned>
SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
                      cl::desc("Loops with a constant trip count smaller than "
                               "this value will not use the count register."));

//===----------------------------------------------------------------------===//
//
// PPC cost model.
//
//===----------------------------------------------------------------------===//
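
// Report whether popcount should use the hardware popcntd/popcntw instructions
// (when the subtarget has them and the type fits in 64 bits) or a software
// expansion.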
TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
    return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
             TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
  return TTI::PSK_Software;
}
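
// InstCombine hook: rewrite PPC vector load/store intrinsics (lvx/stvx and the
// VSX forms) as plain IR loads and stores when that is legal, and turn a vperm
// with a constant mask into an equivalent extract/insert sequence.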
std::optional<Instruction *>
PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
    // Turn PPC lvx -> load if the pointer is known aligned.
    if (getOrEnforceKnownAlignment(
            II.getArgOperand(0), Align(16), IC.getDataLayout(), &II,
            &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
      Value *Ptr = II.getArgOperand(0);
      return new LoadInst(II.getType(), Ptr, "", false, Align(16));
    }
    break;
  case Intrinsic::ppc_vsx_lxvw4x:
  case Intrinsic::ppc_vsx_lxvd2x: {
    // Turn PPC VSX loads into normal loads.
    Value *Ptr = II.getArgOperand(0);
    return new LoadInst(II.getType(), Ptr, Twine(""), false, Align(1));
  }
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
    // Turn stvx -> store if the pointer is known aligned.
    if (getOrEnforceKnownAlignment(
            II.getArgOperand(1), Align(16), IC.getDataLayout(), &II,
            &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
      Value *Ptr = II.getArgOperand(1);
      return new StoreInst(II.getArgOperand(0), Ptr, false, Align(16));
    }
    break;
  case Intrinsic::ppc_vsx_stxvw4x:
  case Intrinsic::ppc_vsx_stxvd2x: {
    // Turn PPC VSX stores into normal stores.
    Value *Ptr = II.getArgOperand(1);
    return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1));
  }
  case Intrinsic::ppc_altivec_vperm:
    // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
    // Note that ppc_altivec_vperm has a big-endian bias, so when creating
    // a vectorshuffle for little endian, we must undo the transformation
    // performed on vec_perm in altivec.h. That is, we must complement
    // the permutation mask with respect to 31 and reverse the order of
    // V1 and V2.
    if (Constant *Mask = dyn_cast<Constant>(II.getArgOperand(2))) {
      assert(cast<FixedVectorType>(Mask->getType())->getNumElements() == 16 &&
             "Bad type for intrinsic!");

      // Check that all of the elements are integer constants or undefs.
      bool AllEltsOk = true;
      for (unsigned i = 0; i != 16; ++i) {
        Constant *Elt = Mask->getAggregateElement(i);
        if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
          AllEltsOk = false;
          break;
        }
      }

      if (AllEltsOk) {
        // Cast the input vectors to byte vectors.
        Value *Op0 =
            IC.Builder.CreateBitCast(II.getArgOperand(0), Mask->getType());
        Value *Op1 =
            IC.Builder.CreateBitCast(II.getArgOperand(1), Mask->getType());
        Value *Result = UndefValue::get(Op0->getType());

        // Only extract each element once.
        Value *ExtractedElts[32];
        memset(ExtractedElts, 0, sizeof(ExtractedElts));

        for (unsigned i = 0; i != 16; ++i) {
          if (isa<UndefValue>(Mask->getAggregateElement(i)))
            continue;
          unsigned Idx =
              cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
          Idx &= 31; // Match the hardware behavior.
          if (DL.isLittleEndian())
            Idx = 31 - Idx;

          if (!ExtractedElts[Idx]) {
            Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
            Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
            ExtractedElts[Idx] = IC.Builder.CreateExtractElement(
                Idx < 16 ? Op0ToUse : Op1ToUse, IC.Builder.getInt32(Idx & 15));
          }

          // Insert this value into the result vector.
          Result = IC.Builder.CreateInsertElement(Result, ExtractedElts[Idx],
                                                  IC.Builder.getInt32(i));
        }
        return CastInst::Create(Instruction::BitCast, Result, II.getType());
      }
    }
    break;
  }
  return std::nullopt;
}
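
// Cost of materializing an integer immediate: one instruction (li/addi) for a
// 16-bit value, lis and possibly ori for a 32-bit value, and a longer sequence
// for anything wider.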
InstructionCost PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                          TTI::TargetCostKind CostKind) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCost(Imm, Ty, CostKind);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  if (Imm == 0)
    return TTI::TCC_Free;

  if (Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Basic;

    if (isInt<32>(Imm.getSExtValue())) {
      // A constant that can be materialized using lis.
      if ((Imm.getZExtValue() & 0xFFFF) == 0)
        return TTI::TCC_Basic;

      return 2 * TTI::TCC_Basic;
    }
  }

  return 4 * TTI::TCC_Basic;
}

InstructionCost PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
                                                const APInt &Imm, Type *Ty,
                                                TTI::TargetCostKind CostKind) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  switch (IID) {
  default:
    return TTI::TCC_Free;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
      return TTI::TCC_Free;
    break;
  }
  return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

InstructionCost PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  if (DisablePPCConstHoist)
    return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind, Inst);

  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return ~0U;

  unsigned ImmIdx = ~0U;
  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
       ZeroFree = false;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::And:
    RunFree = true; // (for the rotate-and-mask instructions)
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::Or:
  case Instruction::Xor:
    ShiftedFree = true;
    [[fallthrough]];
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    ImmIdx = 1;
    break;
  case Instruction::ICmp:
    UnsignedFree = true;
    ImmIdx = 1;
    // Zero comparisons can use record-form instructions.
    [[fallthrough]];
  case Instruction::Select:
    ZeroFree = true;
    break;
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Ret:
  case Instruction::Load:
  case Instruction::Store:
    break;
  }

  if (ZeroFree && Imm == 0)
    return TTI::TCC_Free;

  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
    if (isInt<16>(Imm.getSExtValue()))
      return TTI::TCC_Free;

    if (RunFree) {
      if (Imm.getBitWidth() <= 32 &&
          (isShiftedMask_32(Imm.getZExtValue()) ||
           isShiftedMask_32(~Imm.getZExtValue())))
        return TTI::TCC_Free;

      if (ST->isPPC64() &&
          (isShiftedMask_64(Imm.getZExtValue()) ||
           isShiftedMask_64(~Imm.getZExtValue())))
        return TTI::TCC_Free;
    }

    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
      return TTI::TCC_Free;

    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
      return TTI::TCC_Free;
  }

  return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}

// Check if the current Type is an MMA vector type. Valid MMA types are
// v256i1 and v512i1.
static bool isMMAType(Type *Ty) {
  return Ty->isVectorTy() && (Ty->getScalarSizeInBits() == 1) &&
         (Ty->getPrimitiveSizeInBits() > 128);
}

InstructionCost PPCTTIImpl::getInstructionCost(const User *U,
                                               ArrayRef<const Value *> Operands,
                                               TTI::TargetCostKind CostKind) {
  // We already implement getCastInstrCost and getMemoryOpCost where we perform
  // the vector adjustment there.
  if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U))
    return BaseT::getInstructionCost(U, Operands, CostKind);

  if (U->getType()->isVectorTy()) {
    // Instructions that need to be split should cost more.
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(U->getType());
    return LT.first * BaseT::getInstructionCost(U, Operands, CostKind);
  }

  return BaseT::getInstructionCost(U, Operands, CostKind);
}
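
// Decide whether this loop should become a CTR-based hardware loop
// (mtctr/bdnz). Reject loops that are too small to pay for the mtctr latency,
// loops that already contain hardware-loop intrinsics, and loops whose exit
// edges are more frequently taken than their back edges.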
bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                          AssumptionCache &AC,
                                          TargetLibraryInfo *LibInfo,
                                          HardwareLoopInfo &HWLoopInfo) {
  const PPCTargetMachine &TM = ST->getTargetMachine();
  TargetSchedModel SchedModel;
  SchedModel.init(ST);

  // Do not convert small short loops to CTR loop.
  unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
    SmallPtrSet<const Value *, 32> EphValues;
    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
    CodeMetrics Metrics;
    for (BasicBlock *BB : L->blocks())
      Metrics.analyzeBasicBlock(BB, *this, EphValues);
    // 6 is an approximate latency for the mtctr instruction.
    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
      return false;
  }

  // Check that there are no hardware-loop-related intrinsics in the loop.
  for (auto *BB : L->getBlocks())
    for (auto &I : *BB)
      if (auto *Call = dyn_cast<IntrinsicInst>(&I))
        if (Call->getIntrinsicID() == Intrinsic::set_loop_iterations ||
            Call->getIntrinsicID() == Intrinsic::loop_decrement)
          return false;

  SmallVector<BasicBlock *, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  // If there is an exit edge known to be frequently taken,
  // we should not transform this loop.
  for (auto &BB : ExitingBlocks) {
    Instruction *TI = BB->getTerminator();
    if (!TI)
      continue;

    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      uint64_t TrueWeight = 0, FalseWeight = 0;
      if (!BI->isConditional() ||
          !extractBranchWeights(*BI, TrueWeight, FalseWeight))
        continue;

      // If the exit path is more frequent than the loop path,
      // we return here without further analysis for this loop.
      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
      if ((TrueIsExit && FalseWeight < TrueWeight) ||
          (!TrueIsExit && FalseWeight > TrueWeight))
        return false;
    }
  }

  LLVMContext &C = L->getHeader()->getContext();
  HWLoopInfo.CountType = TM.isPPC64() ?
    Type::getInt64Ty(C) : Type::getInt32Ty(C);
  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  return true;
}

void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP,
                                         OptimizationRemarkEmitter *ORE) {
  if (ST->getCPUDirective() == PPC::DIR_A2) {
    // The A2 is in-order with a deep pipeline, and concatenation unrolling
    // helps expose latency-hiding opportunities to the instruction scheduler.
    UP.Partial = UP.Runtime = true;

    // We unroll a lot on the A2 (hundreds of instructions), and the benefits
    // often outweigh the cost of a division to compute the trip count.
    UP.AllowExpensiveTripCount = true;
  }

  BaseT::getUnrollingPreferences(L, SE, UP, ORE);
}

void PPCTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                       TTI::PeelingPreferences &PP) {
  BaseT::getPeelingPreferences(L, SE, PP);
}

// This function returns true to allow using coldcc calling convention.
// Returning true results in coldcc being used for functions which are cold at
// all call sites when the callers of the functions are not calling any other
// non-coldcc functions.
bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
  return EnablePPCColdCC;
}

bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
  // On the A2, always unroll aggressively.
  if (ST->getCPUDirective() == PPC::DIR_A2)
    return true;

  return LoopHasReductions;
}
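
// Allow inline expansion of small memcmp calls using 8-, 4-, 2- and 1-byte
// loads.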
TTI::MemCmpExpansionOptions
PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;
  Options.LoadSizes = {8, 4, 2, 1};
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  return Options;
}

bool PPCTTIImpl::enableInterleavedAccessVectorization() {
  return true;
}

unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  assert(ClassID == GPRRC || ClassID == FPRRC ||
         ClassID == VRRC || ClassID == VSXRC);
  if (ST->hasVSX()) {
    assert(ClassID == GPRRC || ClassID == VSXRC || ClassID == VRRC);
    return ClassID == VSXRC ? 64 : 32;
  }
  assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC);
  return 32;
}
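
// Map an IR type to the PPC register class that would hold it: vectors go to
// VSX or Altivec registers, float/double to VSX or FPRs, fp128/ppc_fp128 to
// Altivec registers, half to VSX, and everything else to GPRs.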
unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
  if (Vector)
    return ST->hasVSX() ? VSXRC : VRRC;
  else if (Ty && (Ty->getScalarType()->isFloatTy() ||
                  Ty->getScalarType()->isDoubleTy()))
    return ST->hasVSX() ? VSXRC : FPRRC;
  else if (Ty && (Ty->getScalarType()->isFP128Ty() ||
                  Ty->getScalarType()->isPPC_FP128Ty()))
    return VRRC;
  else if (Ty && Ty->getScalarType()->isHalfTy())
    return VSXRC;
  else
    return GPRRC;
}

const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {
  switch (ClassID) {
  default:
    llvm_unreachable("unknown register class");
    return "PPC::unknown register class";
  case GPRRC: return "PPC::GPRRC";
  case FPRRC: return "PPC::FPRRC";
  case VRRC: return "PPC::VRRC";
  case VSXRC: return "PPC::VSXRC";
  }
}

TypeSize
PPCTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->isPPC64() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    return TypeSize::getFixed(ST->hasAltivec() ? 128 : 0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned PPCTTIImpl::getCacheLineSize() const {
  // Starting with P7 we have a cache line size of 128.
  unsigned Directive = ST->getCPUDirective();
  // Assume that Future CPU has the same cache line size as the others.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
      Directive == PPC::DIR_PWR_FUTURE)
    return 128;

  // On other processors return a default of 64 bytes.
  return 64;
}

unsigned PPCTTIImpl::getPrefetchDistance() const {
  return 300;
}

unsigned PPCTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
  unsigned Directive = ST->getCPUDirective();
  // The 440 has no SIMD support, but floating-point instructions
  // have a 5-cycle latency, so unroll by 5x for latency hiding.
  if (Directive == PPC::DIR_440)
    return 5;

  // The A2 has no SIMD support, but floating-point instructions
  // have a 6-cycle latency, so unroll by 6x for latency hiding.
  if (Directive == PPC::DIR_A2)
    return 6;

  // FIXME: For lack of any better information, do no harm...
  if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
    return 1;

  // For P7 and P8, floating-point instructions have a 6-cycle latency and
  // there are two execution units, so unroll by 12x for latency hiding.
  // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
  // FIXME: the same for P10 as previous gen until POWER10 scheduling is ready
  // Assume that future is the same as the others.
  if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
      Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
      Directive == PPC::DIR_PWR_FUTURE)
    return 12;

  // For most things, modern systems have two execution units (and
  // out-of-order execution).
  return 2;
}

// Returns a cost adjustment factor to adjust the cost of vector instructions
// on targets where there is overlap between the vector and scalar units,
// thereby reducing the overall throughput of vector code wrt. scalar code.
// An invalid instruction cost is returned if the type is an MMA vector type.
InstructionCost PPCTTIImpl::vectorCostAdjustmentFactor(unsigned Opcode,
                                                       Type *Ty1, Type *Ty2) {
  // If the vector type is of an MMA type (v256i1, v512i1), an invalid
  // instruction cost is returned. This is to signify to other cost computing
  // functions to return the maximum instruction cost in order to prevent any
  // opportunities for the optimizer to produce MMA types within the IR.
  if (isMMAType(Ty1))
    return InstructionCost::getInvalid();

  if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
    return InstructionCost(1);

  std::pair<InstructionCost, MVT> LT1 = getTypeLegalizationCost(Ty1);
  // If type legalization involves splitting the vector, we don't want to
  // double the cost at every step - only the last step.
  if (LT1.first != 1 || !LT1.second.isVector())
    return InstructionCost(1);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  if (TLI->isOperationExpand(ISD, LT1.second))
    return InstructionCost(1);

  if (Ty2) {
    std::pair<InstructionCost, MVT> LT2 = getTypeLegalizationCost(Ty2);
    if (LT2.first != 1 || !LT2.second.isVector())
      return InstructionCost(1);
  }

  return InstructionCost(2);
}

InstructionCost PPCTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
    ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Ty, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Args, CxtI);

  // Fallback to the default implementation.
  InstructionCost Cost = BaseT::getArithmeticInstrCost(
      Opcode, Ty, CostKind, Op1Info, Op2Info);
  return Cost * CostFactor;
}

InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
                                           ArrayRef<int> Mask,
                                           TTI::TargetCostKind CostKind,
                                           int Index, Type *SubTp,
                                           ArrayRef<const Value *> Args,
                                           const Instruction *CxtI) {

  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Instruction::ShuffleVector, Tp, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);

  // PPC, for both Altivec/VSX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). We need one such shuffle instruction for each actual
  // register (this is not true for arbitrary shuffles, but is true for the
  // structured types of shuffles covered by TTI::ShuffleKind).
  return LT.first * CostFactor;
}

InstructionCost PPCTTIImpl::getCFInstrCost(unsigned Opcode,
                                           TTI::TargetCostKind CostKind,
                                           const Instruction *I) {
  if (CostKind != TTI::TCK_RecipThroughput)
    return Opcode == Instruction::PHI ? 0 : 1;
  // Branches are assumed to be predicted.
  return 0;
}

InstructionCost PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
                                             TTI::CastContextHint CCH,
                                             TTI::TargetCostKind CostKind,
                                             const Instruction *I) {
  assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Dst, Src);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost =
      BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
  Cost *= CostFactor;
  // TODO: Allow non-throughput costs that aren't binary.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost == 0 ? 0 : 1;
  return Cost;
}

InstructionCost PPCTTIImpl::getCmpSelInstrCost(
    unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
    TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
    TTI::OperandValueInfo Op2Info, const Instruction *I) {
  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Opcode, ValTy, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost = BaseT::getCmpSelInstrCost(
      Opcode, ValTy, CondTy, VecPred, CostKind, Op1Info, Op2Info, I);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;
  return Cost * CostFactor;
}

InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  assert(Val->isVectorTy() && "This must be a vector type");

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Val, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  InstructionCost Cost =
      BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
  Cost *= CostFactor;

  if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
    // Double-precision scalars are already located in index #0 (or #1 if LE).
    if (ISD == ISD::EXTRACT_VECTOR_ELT &&
        Index == (ST->isLittleEndian() ? 1 : 0))
      return 0;

    return Cost;

  } else if (Val->getScalarType()->isIntegerTy()) {
    unsigned EltSize = Val->getScalarSizeInBits();
    // Computing on 1 bit values requires extra mask or compare operations.
    unsigned MaskCostForOneBitSize = (VecMaskCost && EltSize == 1) ? 1 : 0;
    // Computing on a non-const index requires extra mask or compare operations.
    unsigned MaskCostForIdx = (Index != -1U) ? 0 : 1;
    if (ST->hasP9Altivec()) {
      // P10 has vxform insert which can handle a non-const index. The
      // MaskCostForIdx is for masking the index.
      // P9 has insert for const index. A move-to VSR and a permute/insert.
      // Assume vector operation cost for both (cost will be 2x on P9).
      if (ISD == ISD::INSERT_VECTOR_ELT) {
        if (ST->hasP10Vector())
          return CostFactor + MaskCostForIdx;
        else if (Index != -1U)
          return 2 * CostFactor;
      } else if (ISD == ISD::EXTRACT_VECTOR_ELT) {
        // It's an extract. Maybe we can do a cheap move-from VSR.
        unsigned EltSize = Val->getScalarSizeInBits();
        // P9 has both mfvsrd and mfvsrld for 64 bit integer.
        if (EltSize == 64 && Index != -1U)
          return 1;
        else if (EltSize == 32) {
          unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1;
          if (Index == MfvsrwzIndex)
            return 1;

          // For other indices, like a non-const one, P9 has vxform extract.
          // The MaskCostForIdx is for masking the index.
          return CostFactor + MaskCostForIdx;
        }

        // We need a vector extract (or mfvsrld). Assume vector operation cost.
        // The cost of the load constant for a vector extract is disregarded
        // (invariant, easily schedulable).
        return CostFactor + MaskCostForOneBitSize + MaskCostForIdx;
      }
    } else if (ST->hasDirectMove() && Index != -1U) {
      // Assume permute has standard cost.
      // Assume move-to/move-from VSR have 2x standard cost.
      if (ISD == ISD::INSERT_VECTOR_ELT)
        return 3;
      return 3 + MaskCostForOneBitSize;
    }
  }

  // Estimated cost of a load-hit-store delay. This was obtained
  // experimentally as a minimum needed to prevent unprofitable
  // vectorization for the paq8p benchmark. It may need to be
  // raised further if other unprofitable cases remain.
  unsigned LHSPenalty = 2;
  if (ISD == ISD::INSERT_VECTOR_ELT)
    LHSPenalty += 7;

  // Vector element insert/extract with Altivec is very expensive,
  // because they require store and reload with the attendant
  // processor stall for load-hit-store. Until VSX is available,
  // these need to be estimated as very costly.
  if (ISD == ISD::EXTRACT_VECTOR_ELT ||
      ISD == ISD::INSERT_VECTOR_ELT)
    return LHSPenalty + Cost;

  return Cost;
}

InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) {

  InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Src, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);
  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  InstructionCost Cost =
      BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;

  Cost *= CostFactor;

  bool IsAltivecType = ST->hasAltivec() &&
                       (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
                        LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
  bool IsVSXType = ST->hasVSX() &&
                   (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);

  // VSX has 32b/64b load instructions. Legalization can handle loading of
  // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
  // PPCTargetLowering can't compute the cost appropriately. So here we
  // explicitly check this case. There are also corresponding store
  // instructions.
  unsigned MemBits = Src->getPrimitiveSizeInBits();
  unsigned SrcBytes = LT.second.getStoreSize();
  if (ST->hasVSX() && IsAltivecType) {
    if (MemBits == 64 || (ST->hasP8Vector() && MemBits == 32))
      return 1;

    // Use lfiwax/xxspltw
    Align AlignBytes = Alignment ? *Alignment : Align(1);
    if (Opcode == Instruction::Load && MemBits == 32 && AlignBytes < SrcBytes)
      return 2;
  }

  // Aligned loads and stores are easy.
  if (!SrcBytes || !Alignment || *Alignment >= SrcBytes)
    return Cost;

  // If we can use the permutation-based load sequence, then this is also
  // relatively cheap (not counting loop-invariant instructions): one load plus
  // one permute (the last load in a series has extra cost, but we're
  // neglecting that here). Note that on the P7, we could do unaligned loads
  // for Altivec types using the VSX instructions, but that's more expensive
  // than using the permutation-based load sequence. On the P8, that's no
  // longer true.
  if (Opcode == Instruction::Load && (!ST->hasP8Vector() && IsAltivecType) &&
      *Alignment >= LT.second.getScalarType().getStoreSize())
    return Cost + LT.first; // Add the cost of the permutations.

  // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
  // P7, unaligned vector loads are more expensive than the permutation-based
  // load sequence, so that might be used instead, but regardless, the net cost
  // is about the same (not counting loop-invariant instructions).
  if (IsVSXType || (ST->hasVSX() && IsAltivecType))
    return Cost;

  // Newer PPC supports unaligned memory access.
  if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
    return Cost;

  // PPC in general does not support unaligned loads and stores. They'll need
  // to be decomposed based on the alignment factor.

  // Add the cost of each scalar load or store.
  assert(Alignment);
  Cost += LT.first * ((SrcBytes / Alignment->value()) - 1);

  // For a vector type, there is also scalarization overhead (only for
  // stores, loads are expanded using the vector-load + permutation sequence,
  // which is much less expensive).
  if (Src->isVectorTy() && Opcode == Instruction::Store)
    for (int i = 0, e = cast<FixedVectorType>(Src)->getNumElements(); i < e;
         ++i)
      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, CostKind, i,
                                 nullptr, nullptr);

  return Cost;
}

InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost(
    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
    Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
    bool UseMaskForCond, bool UseMaskForGaps) {
  InstructionCost CostFactor =
      vectorCostAdjustmentFactor(Opcode, VecTy, nullptr);
  if (!CostFactor.isValid())
    return InstructionCost::getMax();

  if (UseMaskForCond || UseMaskForGaps)
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                             Alignment, AddressSpace, CostKind,
                                             UseMaskForCond, UseMaskForGaps);

  assert(isa<VectorType>(VecTy) &&
         "Expect a vector type for interleaved memory op");

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);

  // Firstly, the cost of load/store operation.
  InstructionCost Cost = getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment),
                                         AddressSpace, CostKind);

  // PPC, for both Altivec/VSX, supports cheap arbitrary permutations
  // (at least in the sense that there need only be one non-loop-invariant
  // instruction). For each result vector, we need one shuffle per incoming
  // vector (except that the first shuffle can take two incoming vectors
  // because it does not need to take itself).
  Cost += Factor * (LT.first - 1);

  return Cost;
}

InstructionCost
PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                  TTI::TargetCostKind CostKind) {
  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}

bool PPCTTIImpl::areTypesABICompatible(const Function *Caller,
                                       const Function *Callee,
                                       const ArrayRef<Type *> &Types) const {

  // We need to ensure that argument promotion does not
  // attempt to promote pointers to MMA types (__vector_pair
  // and __vector_quad) since these types explicitly cannot be
  // passed as arguments. Both of these types are larger than
  // the 128-bit Altivec vectors and have a scalar size of 1 bit.
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  return llvm::none_of(Types, [](Type *Ty) {
    if (Ty->isSized())
      return Ty->isIntOrIntVectorTy(1) && Ty->getPrimitiveSizeInBits() > 128;
    return false;
  });
}
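
// Returns true if the compare in this innermost loop can be eliminated because
// the loop is a hardware-loop candidate (bdnz both decrements the count and
// branches); the loop's exit branch is returned in BI.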
bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
                            LoopInfo *LI, DominatorTree *DT,
                            AssumptionCache *AC, TargetLibraryInfo *LibInfo) {
  // Process nested loops first.
  for (Loop *I : *L)
    if (canSaveCmp(I, BI, SE, LI, DT, AC, LibInfo))
      return false; // Stop search.

  HardwareLoopInfo HWLoopInfo(L);

  if (!HWLoopInfo.canAnalyze(*LI))
    return false;

  if (!isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo))
    return false;

  if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT))
    return false;

  *BI = HWLoopInfo.ExitBranch;
  return true;
}

bool PPCTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                               const TargetTransformInfo::LSRCost &C2) {
  // PowerPC default behaviour here is "instruction number 1st priority".
  // If LsrNoInsnsCost is set, call default implementation.
  if (!LsrNoInsnsCost)
    return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                    C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
           std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                    C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
  else
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
}

bool PPCTTIImpl::isNumRegsMajorCostOfLSR() {
  return false;
}

bool PPCTTIImpl::shouldBuildRelLookupTables() const {
  const PPCTargetMachine &TM = ST->getTargetMachine();
  // XCOFF hasn't implemented lowerRelativeReference, disable non-ELF for now.
  if (!TM.isELFv2ABI())
    return false;
  return BaseT::shouldBuildRelLookupTables();
}
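
// Describe the PPC load/store intrinsics for the generic passes: record the
// pointer operand and whether the intrinsic reads or writes memory.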
bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                    MemIntrinsicInfo &Info) {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
  case Intrinsic::ppc_altivec_lvebx:
  case Intrinsic::ppc_altivec_lvehx:
  case Intrinsic::ppc_altivec_lvewx:
  case Intrinsic::ppc_vsx_lxvd2x:
  case Intrinsic::ppc_vsx_lxvw4x:
  case Intrinsic::ppc_vsx_lxvd2x_be:
  case Intrinsic::ppc_vsx_lxvw4x_be:
  case Intrinsic::ppc_vsx_lxvl:
  case Intrinsic::ppc_vsx_lxvll:
  case Intrinsic::ppc_vsx_lxvp: {
    Info.PtrVal = Inst->getArgOperand(0);
    Info.ReadMem = true;
    Info.WriteMem = false;
    return true;
  }
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
  case Intrinsic::ppc_altivec_stvebx:
  case Intrinsic::ppc_altivec_stvehx:
  case Intrinsic::ppc_altivec_stvewx:
  case Intrinsic::ppc_vsx_stxvd2x:
  case Intrinsic::ppc_vsx_stxvw4x:
  case Intrinsic::ppc_vsx_stxvd2x_be:
  case Intrinsic::ppc_vsx_stxvw4x_be:
  case Intrinsic::ppc_vsx_stxvl:
  case Intrinsic::ppc_vsx_stxvll:
  case Intrinsic::ppc_vsx_stxvp: {
    Info.PtrVal = Inst->getArgOperand(1);
    Info.ReadMem = false;
    Info.WriteMem = true;
    return true;
  }
  case Intrinsic::ppc_stbcx:
  case Intrinsic::ppc_sthcx:
  case Intrinsic::ppc_stdcx:
  case Intrinsic::ppc_stwcx: {
    Info.PtrVal = Inst->getArgOperand(0);
    Info.ReadMem = false;
    Info.WriteMem = true;
    return true;
  }
  default:
    break;
  }

  return false;
}

bool PPCTTIImpl::hasActiveVectorLength(unsigned Opcode, Type *DataType,
                                       Align Alignment) const {
  // Only load and store instructions can have a variable vector length on
  // Power.
  if (Opcode != Instruction::Load && Opcode != Instruction::Store)
    return false;
  // Loads/stores with length instructions use bits 0-7 of the GPR operand and
  // therefore cannot be used in 32-bit mode.
  if ((!ST->hasP9Vector() && !ST->hasP10Vector()) || !ST->isPPC64())
    return false;
  if (isa<FixedVectorType>(DataType)) {
    unsigned VecWidth = DataType->getPrimitiveSizeInBits();
    return VecWidth == 128;
  }
  Type *ScalarTy = DataType->getScalarType();

  if (ScalarTy->isPointerTy())
    return true;

  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
    return true;

  if (!ScalarTy->isIntegerTy())
    return false;

  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
  return IntWidth == 8 || IntWidth == 16 || IntWidth == 32 || IntWidth == 64;
}

InstructionCost PPCTTIImpl::getVPMemoryOpCost(unsigned Opcode, Type *Src,
                                              Align Alignment,
                                              unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              const Instruction *I) {
  InstructionCost Cost = BaseT::getVPMemoryOpCost(Opcode, Src, Alignment,
                                                  AddressSpace, CostKind, I);
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return Cost;
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return Cost;

  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode");

  auto *SrcVTy = dyn_cast<FixedVectorType>(Src);
  assert(SrcVTy && "Expected a vector type for VP memory operations");

  if (hasActiveVectorLength(Opcode, Src, Alignment)) {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);

    InstructionCost CostFactor =
        vectorCostAdjustmentFactor(Opcode, Src, nullptr);
    if (!CostFactor.isValid())
      return InstructionCost::getMax();

    InstructionCost Cost = LT.first * CostFactor;
    assert(Cost.isValid() && "Expected valid cost");

    // On P9 but not on P10, if the op is misaligned then it will cause a
    // pipeline flush. Otherwise the VSX masked memops cost the same as
    // unmasked ones.
    const Align DesiredAlignment(16);
    if (Alignment >= DesiredAlignment || ST->getCPUDirective() != PPC::DIR_PWR9)
      return Cost;

    // Since alignment may be underestimated, we try to compute the probability
    // that the actual address is aligned to the desired boundary. For example,
    // an 8-byte aligned load is assumed to be actually 16-byte aligned half
    // the time, while a 4-byte aligned load has a 25% chance of being 16-byte
    // aligned.
    float AlignmentProb = ((float)Alignment.value()) / DesiredAlignment.value();
    float MisalignmentProb = 1.0 - AlignmentProb;
    return (MisalignmentProb * P9PipelineFlushEstimate) +
           (AlignmentProb * *Cost.getValue());
  }

  // Usually we should not get to this point, but the following is an attempt
  // to model the cost of legalization. Currently we can only lower intrinsics
  // with evl but no mask, on Power 9/10. Otherwise, we must scalarize.
  return getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
}
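
// Tail-call support is decided by the lowering code.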
bool PPCTTIImpl::supportsTailCallFor(const CallBase *CB) const {
  return TLI->supportsTailCallFor(CB);
}