LLVM 23.0.0git
PPCTargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
16#include "llvm/IR/IntrinsicsPowerPC.h"
21#include <optional>
22
23using namespace llvm;
24
25#define DEBUG_TYPE "ppctti"
26
// Permit vector-predicated loads/stores that carry an explicit vector
// length (EVL) operand. Off by default.
static cl::opt<bool> PPCEVL("ppc-evl",
                            cl::desc("Allow EVL type vp.load/vp.store"),
                            cl::init(false), cl::Hidden);

// Additionally permit vp.load/vp.store on Power9. Off by default.
static cl::opt<bool> Pwr9EVL("ppc-pwr9-evl",
                             cl::desc("Allow vp.load and vp.store for pwr9"),
                             cl::init(false), cl::Hidden);

// When set, vector-element instruction costs for i1 vectors include the
// extra mask/compare operations needed for 1-bit elements (consulted in
// getVectorInstrCost). On by default.
static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost",
cl::desc("add masking cost for i1 vectors"), cl::init(true), cl::Hidden);

// Disable the PPC-specific immediate-cost model used by constant hoisting;
// presumably falls back to the BaseT::getIntImmCost* implementations below
// (the guard lines are elided in this view — confirm against full source).
static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);

// Allow using the coldcc calling convention for internal functions that are
// cold at every call site.
static cl::opt<bool>
EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
                cl::desc("Enable using coldcc calling conv for cold "
                         "internal functions"));

// When set, LSR cost comparison skips the instruction-count-first ordering
// implemented in isLSRCostLess and defers to the default implementation.
static cl::opt<bool>
LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
               cl::desc("Do not add instruction count to lsr cost model"));
49
50// The latency of mtctr is only justified if there are more than 4
51// comparisons that will be removed as a result.
53SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
54 cl::desc("Loops with a constant trip count smaller than "
55 "this value will not use the count register."));
56
57//===----------------------------------------------------------------------===//
58//
59// PPC cost model.
60//
61//===----------------------------------------------------------------------===//
62
64PPCTTIImpl::getPopcntSupport(unsigned TyWidth) const {
65 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
66 if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
67 return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
69 return TTI::PSK_Software;
70}
71
72std::optional<Instruction *>
74 Intrinsic::ID IID = II.getIntrinsicID();
75 switch (IID) {
76 default:
77 break;
78 case Intrinsic::ppc_altivec_lvx:
79 case Intrinsic::ppc_altivec_lvxl:
80 // Turn PPC lvx -> load if the pointer is known aligned.
82 II.getArgOperand(0), Align(16), IC.getDataLayout(), &II,
83 &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
84 Value *Ptr = II.getArgOperand(0);
85 return new LoadInst(II.getType(), Ptr, "", false, Align(16));
86 }
87 break;
88 case Intrinsic::ppc_vsx_lxvw4x:
89 case Intrinsic::ppc_vsx_lxvd2x: {
90 // Turn PPC VSX loads into normal loads.
91 Value *Ptr = II.getArgOperand(0);
92 return new LoadInst(II.getType(), Ptr, Twine(""), false, Align(1));
93 }
94 case Intrinsic::ppc_altivec_stvx:
95 case Intrinsic::ppc_altivec_stvxl:
96 // Turn stvx -> store if the pointer is known aligned.
98 II.getArgOperand(1), Align(16), IC.getDataLayout(), &II,
99 &IC.getAssumptionCache(), &IC.getDominatorTree()) >= 16) {
100 Value *Ptr = II.getArgOperand(1);
101 return new StoreInst(II.getArgOperand(0), Ptr, false, Align(16));
102 }
103 break;
104 case Intrinsic::ppc_vsx_stxvw4x:
105 case Intrinsic::ppc_vsx_stxvd2x: {
106 // Turn PPC VSX stores into normal stores.
107 Value *Ptr = II.getArgOperand(1);
108 return new StoreInst(II.getArgOperand(0), Ptr, false, Align(1));
109 }
110 case Intrinsic::ppc_altivec_vperm:
111 // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
112 // Note that ppc_altivec_vperm has a big-endian bias, so when creating
113 // a vectorshuffle for little endian, we must undo the transformation
114 // performed on vec_perm in altivec.h. That is, we must complement
115 // the permutation mask with respect to 31 and reverse the order of
116 // V1 and V2.
117 if (Constant *Mask = dyn_cast<Constant>(II.getArgOperand(2))) {
118 assert(cast<FixedVectorType>(Mask->getType())->getNumElements() == 16 &&
119 "Bad type for intrinsic!");
120
121 // Check that all of the elements are integer constants or undefs.
122 bool AllEltsOk = true;
123 for (unsigned I = 0; I != 16; ++I) {
124 Constant *Elt = Mask->getAggregateElement(I);
125 if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
126 AllEltsOk = false;
127 break;
128 }
129 }
130
131 if (AllEltsOk) {
132 // Cast the input vectors to byte vectors.
133 Value *Op0 =
134 IC.Builder.CreateBitCast(II.getArgOperand(0), Mask->getType());
135 Value *Op1 =
136 IC.Builder.CreateBitCast(II.getArgOperand(1), Mask->getType());
137 Value *Result = PoisonValue::get(Op0->getType());
138
139 // Only extract each element once.
140 Value *ExtractedElts[32];
141 memset(ExtractedElts, 0, sizeof(ExtractedElts));
142
143 for (unsigned I = 0; I != 16; ++I) {
144 if (isa<UndefValue>(Mask->getAggregateElement(I)))
145 continue;
146 unsigned Idx =
147 cast<ConstantInt>(Mask->getAggregateElement(I))->getZExtValue();
148 Idx &= 31; // Match the hardware behavior.
149 if (DL.isLittleEndian())
150 Idx = 31 - Idx;
151
152 if (!ExtractedElts[Idx]) {
153 Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
154 Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
155 ExtractedElts[Idx] = IC.Builder.CreateExtractElement(
156 Idx < 16 ? Op0ToUse : Op1ToUse, IC.Builder.getInt32(Idx & 15));
157 }
158
159 // Insert this value into the result vector.
160 Result = IC.Builder.CreateInsertElement(Result, ExtractedElts[Idx],
161 IC.Builder.getInt32(I));
162 }
163 return CastInst::Create(Instruction::BitCast, Result, II.getType());
164 }
165 }
166 break;
167 }
168 return std::nullopt;
169}
170
174 return BaseT::getIntImmCost(Imm, Ty, CostKind);
175
176 assert(Ty->isIntegerTy());
177
178 unsigned BitSize = Ty->getPrimitiveSizeInBits();
179 if (BitSize == 0)
180 return ~0U;
181
182 if (Imm == 0)
183 return TTI::TCC_Free;
184
185 if (Imm.getBitWidth() <= 64) {
186 if (isInt<16>(Imm.getSExtValue()))
187 return TTI::TCC_Basic;
188
189 if (isInt<32>(Imm.getSExtValue())) {
190 // A constant that can be materialized using lis.
191 if ((Imm.getZExtValue() & 0xFFFF) == 0)
192 return TTI::TCC_Basic;
193
194 return 2 * TTI::TCC_Basic;
195 }
196 }
197
198 return 4 * TTI::TCC_Basic;
199}
200
203 const APInt &Imm, Type *Ty,
206 return BaseT::getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind);
207
208 assert(Ty->isIntegerTy());
209
210 unsigned BitSize = Ty->getPrimitiveSizeInBits();
211 if (BitSize == 0)
212 return ~0U;
213
214 switch (IID) {
215 default:
216 return TTI::TCC_Free;
217 case Intrinsic::sadd_with_overflow:
218 case Intrinsic::uadd_with_overflow:
219 case Intrinsic::ssub_with_overflow:
220 case Intrinsic::usub_with_overflow:
221 if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
222 return TTI::TCC_Free;
223 break;
224 case Intrinsic::experimental_stackmap:
225 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
226 return TTI::TCC_Free;
227 break;
228 case Intrinsic::experimental_patchpoint_void:
229 case Intrinsic::experimental_patchpoint:
230 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
231 return TTI::TCC_Free;
232 break;
233 }
234 return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
235}
236
237InstructionCost PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
238 const APInt &Imm, Type *Ty,
240 Instruction *Inst) const {
242 return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind, Inst);
243
244 assert(Ty->isIntegerTy());
245
246 unsigned BitSize = Ty->getPrimitiveSizeInBits();
247 if (BitSize == 0)
248 return ~0U;
249
250 unsigned ImmIdx = ~0U;
251 bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
252 ZeroFree = false;
253 switch (Opcode) {
254 default:
255 return TTI::TCC_Free;
256 case Instruction::GetElementPtr:
257 // Always hoist the base address of a GetElementPtr. This prevents the
258 // creation of new constants for every base constant that gets constant
259 // folded with the offset.
260 if (Idx == 0)
261 return 2 * TTI::TCC_Basic;
262 return TTI::TCC_Free;
263 case Instruction::And:
264 RunFree = true; // (for the rotate-and-mask instructions)
265 [[fallthrough]];
266 case Instruction::Add:
267 case Instruction::Or:
268 case Instruction::Xor:
269 ShiftedFree = true;
270 [[fallthrough]];
271 case Instruction::Sub:
272 case Instruction::Mul:
273 case Instruction::Shl:
274 case Instruction::LShr:
275 case Instruction::AShr:
276 ImmIdx = 1;
277 break;
278 case Instruction::ICmp:
279 UnsignedFree = true;
280 ImmIdx = 1;
281 // Zero comparisons can use record-form instructions.
282 [[fallthrough]];
283 case Instruction::Select:
284 ZeroFree = true;
285 break;
286 case Instruction::PHI:
287 case Instruction::Call:
288 case Instruction::Ret:
289 case Instruction::Load:
290 case Instruction::Store:
291 break;
292 }
293
294 if (ZeroFree && Imm == 0)
295 return TTI::TCC_Free;
296
297 if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
298 if (isInt<16>(Imm.getSExtValue()))
299 return TTI::TCC_Free;
300
301 if (RunFree) {
302 if (Imm.getBitWidth() <= 32 &&
303 (isShiftedMask_32(Imm.getZExtValue()) ||
304 isShiftedMask_32(~Imm.getZExtValue())))
305 return TTI::TCC_Free;
306
307 if (ST->isPPC64() &&
308 (isShiftedMask_64(Imm.getZExtValue()) ||
309 isShiftedMask_64(~Imm.getZExtValue())))
310 return TTI::TCC_Free;
311 }
312
313 if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
314 return TTI::TCC_Free;
315
316 if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
317 return TTI::TCC_Free;
318 }
319
320 return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
321}
322
323// Check if the current Type is an MMA vector type. Valid MMA types are
324// v256i1 and v512i1 respectively.
325static bool isMMAType(Type *Ty) {
326 return Ty->isVectorTy() && (Ty->getScalarSizeInBits() == 1) &&
327 (Ty->getPrimitiveSizeInBits() > 128);
328}
329
333 // We already implement getCastInstrCost and getMemoryOpCost where we perform
334 // the vector adjustment there.
335 if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U))
336 return BaseT::getInstructionCost(U, Operands, CostKind);
337
338 if (U->getType()->isVectorTy()) {
339 // Instructions that need to be split should cost more.
340 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(U->getType());
341 return LT.first * BaseT::getInstructionCost(U, Operands, CostKind);
342 }
343
344 return BaseT::getInstructionCost(U, Operands, CostKind);
345}
346
348 AssumptionCache &AC,
349 TargetLibraryInfo *LibInfo,
350 HardwareLoopInfo &HWLoopInfo) const {
351 const PPCTargetMachine &TM = ST->getTargetMachine();
352 TargetSchedModel SchedModel;
353 SchedModel.init(ST);
354
355 // FIXME: Sure there is no other way to get TTI? This should be cheap though.
357 TM.getTargetTransformInfo(*L->getHeader()->getParent());
358
359 // Do not convert small short loops to CTR loop.
360 unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
361 if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
363 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
365 for (BasicBlock *BB : L->blocks())
366 Metrics.analyzeBasicBlock(BB, TTI, EphValues);
367 // 6 is an approximate latency for the mtctr instruction.
368 if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
369 return false;
370 }
371
372 // Check that there is no hardware loop related intrinsics in the loop.
373 for (auto *BB : L->getBlocks())
374 for (auto &I : *BB)
375 if (auto *Call = dyn_cast<IntrinsicInst>(&I))
376 if (Call->getIntrinsicID() == Intrinsic::set_loop_iterations ||
377 Call->getIntrinsicID() == Intrinsic::loop_decrement)
378 return false;
379
380 SmallVector<BasicBlock*, 4> ExitingBlocks;
381 L->getExitingBlocks(ExitingBlocks);
382
383 // If there is an exit edge known to be frequently taken,
384 // we should not transform this loop.
385 for (auto &BB : ExitingBlocks) {
386 Instruction *TI = BB->getTerminator();
387 if (!TI) continue;
388
389 if (CondBrInst *BI = dyn_cast<CondBrInst>(TI)) {
390 uint64_t TrueWeight = 0, FalseWeight = 0;
391 if (!extractBranchWeights(*BI, TrueWeight, FalseWeight))
392 continue;
393
394 // If the exit path is more frequent than the loop path,
395 // we return here without further analysis for this loop.
396 bool TrueIsExit = !L->contains(BI->getSuccessor(0));
397 if (( TrueIsExit && FalseWeight < TrueWeight) ||
398 (!TrueIsExit && FalseWeight > TrueWeight))
399 return false;
400 }
401 }
402
403 LLVMContext &C = L->getHeader()->getContext();
404 HWLoopInfo.CountType = TM.isPPC64() ?
406 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
407 return true;
408}
409
412 OptimizationRemarkEmitter *ORE) const {
413 if (ST->getCPUDirective() == PPC::DIR_A2) {
414 // The A2 is in-order with a deep pipeline, and concatenation unrolling
415 // helps expose latency-hiding opportunities to the instruction scheduler.
416 UP.Partial = UP.Runtime = true;
417
418 // We unroll a lot on the A2 (hundreds of instructions), and the benefits
419 // often outweigh the cost of a division to compute the trip count.
420 UP.AllowExpensiveTripCount = true;
421 }
422
423 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
424}
425
430// This function returns true to allow using coldcc calling convention.
431// Returning true results in coldcc being used for functions which are cold at
432// all call sites when the callers of the functions are not calling any other
433// non coldcc functions.
437
438bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) const {
439 // On the A2, always unroll aggressively.
440 if (ST->getCPUDirective() == PPC::DIR_A2)
441 return true;
442
443 return LoopHasReductions;
444}
445
447PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
449 if (getST()->hasAltivec())
450 Options.LoadSizes = {16, 8, 4, 2, 1};
451 else
452 Options.LoadSizes = {8, 4, 2, 1};
453
454 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
455 return Options;
456}
457
459
460unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
461 assert(ClassID == GPRRC || ClassID == FPRRC ||
462 ClassID == VRRC || ClassID == VSXRC);
463 if (ST->hasVSX()) {
464 assert(ClassID == GPRRC || ClassID == VSXRC || ClassID == VRRC);
465 return ClassID == VSXRC ? 64 : 32;
466 }
467 assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC);
468 return 32;
469}
470
472 if (Vector)
473 return ST->hasVSX() ? VSXRC : VRRC;
474 if (Ty &&
475 (Ty->getScalarType()->isFloatTy() || Ty->getScalarType()->isDoubleTy()))
476 return ST->hasVSX() ? VSXRC : FPRRC;
477 if (Ty && (Ty->getScalarType()->isFP128Ty() ||
478 Ty->getScalarType()->isPPC_FP128Ty()))
479 return VRRC;
480 if (Ty && Ty->getScalarType()->isHalfTy())
481 return VSXRC;
482 return GPRRC;
483}
484
485const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {
486
487 switch (ClassID) {
488 default:
489 llvm_unreachable("unknown register class");
490 return "PPC::unknown register class";
491 case GPRRC: return "PPC::GPRRC";
492 case FPRRC: return "PPC::FPRRC";
493 case VRRC: return "PPC::VRRC";
494 case VSXRC: return "PPC::VSXRC";
495 }
496}
497
500 switch (K) {
502 return TypeSize::getFixed(ST->isPPC64() ? 64 : 32);
504 return TypeSize::getFixed(ST->hasAltivec() ? 128 : 0);
506 return TypeSize::getScalable(0);
507 }
508
509 llvm_unreachable("Unsupported register kind");
510}
511
513 // Starting with P7 we have a cache line size of 128.
514 unsigned Directive = ST->getCPUDirective();
515 // Assume that Future CPU has the same cache line size as the others.
519 return 128;
520
521 // On other processors return a default of 64 bytes.
522 return 64;
523}
524
526 return 300;
527}
528
530 unsigned Directive = ST->getCPUDirective();
531 // The 440 has no SIMD support, but floating-point instructions
532 // have a 5-cycle latency, so unroll by 5x for latency hiding.
533 if (Directive == PPC::DIR_440)
534 return 5;
535
536 // The A2 has no SIMD support, but floating-point instructions
537 // have a 6-cycle latency, so unroll by 6x for latency hiding.
538 if (Directive == PPC::DIR_A2)
539 return 6;
540
541 // FIXME: For lack of any better information, do no harm...
543 return 1;
544
545 // For P7 and P8, floating-point instructions have a 6-cycle latency and
546 // there are two execution units, so unroll by 12x for latency hiding.
547 // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
548 // FIXME: the same for P10 as previous gen until POWER10 scheduling is ready
549 // Assume that future is the same as the others.
553 return 12;
554
555 // For most things, modern systems have two execution units (and
556 // out-of-order execution).
557 return 2;
558}
559
560// Returns a cost adjustment factor to adjust the cost of vector instructions
561// on targets which there is overlap between the vector and scalar units,
562// thereby reducing the overall throughput of vector code wrt. scalar code.
563// An invalid instruction cost is returned if the type is an MMA vector type.
565 Type *Ty1,
566 Type *Ty2) const {
567 // If the vector type is of an MMA type (v256i1, v512i1), an invalid
568 // instruction cost is returned. This is to signify to other cost computing
569 // functions to return the maximum instruction cost in order to prevent any
570 // opportunities for the optimizer to produce MMA types within the IR.
571 if (isMMAType(Ty1))
573
574 if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
575 return InstructionCost(1);
576
577 std::pair<InstructionCost, MVT> LT1 = getTypeLegalizationCost(Ty1);
578 // If type legalization involves splitting the vector, we don't want to
579 // double the cost at every step - only the last step.
580 if (LT1.first != 1 || !LT1.second.isVector())
581 return InstructionCost(1);
582
583 int ISD = TLI->InstructionOpcodeToISD(Opcode);
584 if (TLI->isOperationExpand(ISD, LT1.second))
585 return InstructionCost(1);
586
587 if (Ty2) {
588 std::pair<InstructionCost, MVT> LT2 = getTypeLegalizationCost(Ty2);
589 if (LT2.first != 1 || !LT2.second.isVector())
590 return InstructionCost(1);
591 }
592
593 return InstructionCost(2);
594}
595
597 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
599 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
600 assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
601
602 InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Ty, nullptr);
603 if (!CostFactor.isValid())
605
606 // TODO: Handle more cost kinds.
608 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
609 Op2Info, Args, CxtI);
610
611 // Fallback to the default implementation.
613 Opcode, Ty, CostKind, Op1Info, Op2Info);
614 return Cost * CostFactor;
615}
616
618 VectorType *DstTy, VectorType *SrcTy,
619 ArrayRef<int> Mask,
621 int Index, VectorType *SubTp,
623 const Instruction *CxtI) const {
624
625 InstructionCost CostFactor =
626 vectorCostAdjustmentFactor(Instruction::ShuffleVector, SrcTy, nullptr);
627 if (!CostFactor.isValid())
629
630 // Legalize the type.
631 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
632
633 // PPC, for both Altivec/VSX, support cheap arbitrary permutations
634 // (at least in the sense that there need only be one non-loop-invariant
635 // instruction). We need one such shuffle instruction for each actual
636 // register (this is not true for arbitrary shuffles, but is true for the
637 // structured types of shuffles covered by TTI::ShuffleKind).
638 return LT.first * CostFactor;
639}
640
643 const Instruction *I) const {
645 return Opcode == Instruction::PHI ? 0 : 1;
646 // Branches are assumed to be predicted.
647 return 0;
648}
649
651 Type *Src,
654 const Instruction *I) const {
655 assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
656
657 InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Dst, Src);
658 if (!CostFactor.isValid())
660
662 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
663 Cost *= CostFactor;
664 // TODO: Allow non-throughput costs that aren't binary.
666 return Cost == 0 ? 0 : 1;
667 return Cost;
668}
669
671 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
673 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
674 InstructionCost CostFactor =
675 vectorCostAdjustmentFactor(Opcode, ValTy, nullptr);
676 if (!CostFactor.isValid())
678
680 Opcode, ValTy, CondTy, VecPred, CostKind, Op1Info, Op2Info, I);
681 // TODO: Handle other cost kinds.
683 return Cost;
684 return Cost * CostFactor;
685}
686
688 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
689 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
690 assert(Val->isVectorTy() && "This must be a vector type");
691
692 int ISD = TLI->InstructionOpcodeToISD(Opcode);
693 assert(ISD && "Invalid opcode");
694
695 InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Val, nullptr);
696 if (!CostFactor.isValid())
698
700 BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1, VIC);
701 Cost *= CostFactor;
702
703 if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
704 // Double-precision scalars are already located in index #0 (or #1 if LE).
706 Index == (ST->isLittleEndian() ? 1 : 0))
707 return 0;
708
709 return Cost;
710 }
711 if (Val->getScalarType()->isIntegerTy()) {
712 unsigned EltSize = Val->getScalarSizeInBits();
713 // Computing on 1 bit values requires extra mask or compare operations.
714 unsigned MaskCostForOneBitSize = (VecMaskCost && EltSize == 1) ? 1 : 0;
715 // Computing on non const index requires extra mask or compare operations.
716 unsigned MaskCostForIdx = (Index != -1U) ? 0 : 1;
717 if (ST->hasP9Altivec()) {
718 // P10 has vxform insert which can handle non const index. The
719 // MaskCostForIdx is for masking the index.
720 // P9 has insert for const index. A move-to VSR and a permute/insert.
721 // Assume vector operation cost for both (cost will be 2x on P9).
723 if (ST->hasP10Vector())
724 return CostFactor + MaskCostForIdx;
725 if (Index != -1U)
726 return 2 * CostFactor;
727 } else if (ISD == ISD::EXTRACT_VECTOR_ELT) {
728 // It's an extract. Maybe we can do a cheap move-from VSR.
729 unsigned EltSize = Val->getScalarSizeInBits();
730 // P9 has both mfvsrd and mfvsrld for 64 bit integer.
731 if (EltSize == 64 && Index != -1U)
732 return 1;
733 if (EltSize == 32) {
734 unsigned MfvsrwzIndex = ST->isLittleEndian() ? 2 : 1;
735 if (Index == MfvsrwzIndex)
736 return 1;
737
738 // For other indexs like non const, P9 has vxform extract. The
739 // MaskCostForIdx is for masking the index.
740 return CostFactor + MaskCostForIdx;
741 }
742
743 // We need a vector extract (or mfvsrld). Assume vector operation cost.
744 // The cost of the load constant for a vector extract is disregarded
745 // (invariant, easily schedulable).
746 return CostFactor + MaskCostForOneBitSize + MaskCostForIdx;
747 }
748 } else if (ST->hasDirectMove() && Index != -1U) {
749 // Assume permute has standard cost.
750 // Assume move-to/move-from VSR have 2x standard cost.
752 return 3;
753 return 3 + MaskCostForOneBitSize;
754 }
755 }
756
757 // Estimated cost of a load-hit-store delay. This was obtained
758 // experimentally as a minimum needed to prevent unprofitable
759 // vectorization for the paq8p benchmark. It may need to be
760 // raised further if other unprofitable cases remain.
761 unsigned LHSPenalty = 2;
763 LHSPenalty += 7;
764
765 // Vector element insert/extract with Altivec is very expensive,
766 // because they require store and reload with the attendant
767 // processor stall for load-hit-store. Until VSX is available,
768 // these need to be estimated as very costly.
771 return LHSPenalty + Cost;
772
773 return Cost;
774}
775
777 Align Alignment,
778 unsigned AddressSpace,
781 const Instruction *I) const {
782
783 InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Src, nullptr);
784 if (!CostFactor.isValid())
786
787 if (TLI->getValueType(DL, Src, true) == MVT::Other)
788 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
789 CostKind);
790 // Legalize the type.
791 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
792 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
793 "Invalid Opcode");
794
796 BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
797 // TODO: Handle other cost kinds.
799 return Cost;
800
801 Cost *= CostFactor;
802
803 bool IsAltivecType = ST->hasAltivec() &&
804 (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
805 LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
806 bool IsVSXType = ST->hasVSX() &&
807 (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
808
809 // VSX has 32b/64b load instructions. Legalization can handle loading of
810 // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
811 // PPCTargetLowering can't compute the cost appropriately. So here we
812 // explicitly check this case. There are also corresponding store
813 // instructions.
814 unsigned MemBits = Src->getPrimitiveSizeInBits();
815 unsigned SrcBytes = LT.second.getStoreSize();
816 if (ST->hasVSX() && IsAltivecType) {
817 if (MemBits == 64 || (ST->hasP8Vector() && MemBits == 32))
818 return 1;
819
820 // Use lfiwax/xxspltw
821 if (Opcode == Instruction::Load && MemBits == 32 && Alignment < SrcBytes)
822 return 2;
823 }
824
825 // Aligned loads and stores are easy.
826 if (!SrcBytes || Alignment >= SrcBytes)
827 return Cost;
828
829 // If we can use the permutation-based load sequence, then this is also
830 // relatively cheap (not counting loop-invariant instructions): one load plus
831 // one permute (the last load in a series has extra cost, but we're
832 // neglecting that here). Note that on the P7, we could do unaligned loads
833 // for Altivec types using the VSX instructions, but that's more expensive
834 // than using the permutation-based load sequence. On the P8, that's no
835 // longer true.
836 if (Opcode == Instruction::Load && (!ST->hasP8Vector() && IsAltivecType) &&
837 Alignment >= LT.second.getScalarType().getStoreSize())
838 return Cost + LT.first; // Add the cost of the permutations.
839
840 // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
841 // P7, unaligned vector loads are more expensive than the permutation-based
842 // load sequence, so that might be used instead, but regardless, the net cost
843 // is about the same (not counting loop-invariant instructions).
844 if (IsVSXType || (ST->hasVSX() && IsAltivecType))
845 return Cost;
846
847 // Newer PPC supports unaligned memory access.
848 if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
849 return Cost;
850
851 // PPC in general does not support unaligned loads and stores. They'll need
852 // to be decomposed based on the alignment factor.
853
854 // Add the cost of each scalar load or store.
855 Cost += LT.first * ((SrcBytes / Alignment.value()) - 1);
856
857 // For a vector type, there is also scalarization overhead (only for
858 // stores, loads are expanded using the vector-load + permutation sequence,
859 // which is much less expensive).
860 if (Src->isVectorTy() && Opcode == Instruction::Store)
861 for (int I = 0, E = cast<FixedVectorType>(Src)->getNumElements(); I < E;
862 ++I)
863 Cost +=
864 getVectorInstrCost(Instruction::ExtractElement, Src, CostKind, I,
865 nullptr, nullptr, TTI::VectorInstrContext::None);
866
867 return Cost;
868}
869
871 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
872 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
873 bool UseMaskForCond, bool UseMaskForGaps) const {
874 InstructionCost CostFactor =
875 vectorCostAdjustmentFactor(Opcode, VecTy, nullptr);
876 if (!CostFactor.isValid())
878
879 if (UseMaskForCond || UseMaskForGaps)
880 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
881 Alignment, AddressSpace, CostKind,
882 UseMaskForCond, UseMaskForGaps);
883
884 assert(isa<VectorType>(VecTy) &&
885 "Expect a vector type for interleaved memory op");
886
887 // Legalize the type.
888 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
889
890 // Firstly, the cost of load/store operation.
892 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
893
894 // PPC, for both Altivec/VSX, support cheap arbitrary permutations
895 // (at least in the sense that there need only be one non-loop-invariant
896 // instruction). For each result vector, we need one shuffle per incoming
897 // vector (except that the first shuffle can take two incoming vectors
898 // because it does not need to take itself).
899 Cost += Factor*(LT.first-1);
900
901 return Cost;
902}
903
907
910
911 if (ICA.getID() == Intrinsic::vp_load) {
912 MemIntrinsicCostAttributes MICA(Intrinsic::masked_load, ICA.getReturnType(),
913 Align(1), 0);
915 }
916
917 if (ICA.getID() == Intrinsic::vp_store) {
918 MemIntrinsicCostAttributes MICA(Intrinsic::masked_store,
919 ICA.getArgTypes()[0], Align(1), 0);
921 }
922
924}
925
927 const Function *Callee) const {
928 const TargetMachine &TM = getTLI()->getTargetMachine();
929
930 const FeatureBitset &CallerBits =
931 TM.getSubtargetImpl(*Caller)->getFeatureBits();
932 const FeatureBitset &CalleeBits =
933 TM.getSubtargetImpl(*Callee)->getFeatureBits();
934
935 // Check that targets features are exactly the same. We can revisit to see if
936 // we can improve this.
937 return CallerBits == CalleeBits;
938}
939
941 const Function *Callee,
942 ArrayRef<Type *> Types) const {
943
944 // We need to ensure that argument promotion does not
945 // attempt to promote pointers to MMA types (__vector_pair
946 // and __vector_quad) since these types explicitly cannot be
947 // passed as arguments. Both of these types are larger than
948 // the 128-bit Altivec vectors and have a scalar size of 1 bit.
949 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
950 return false;
951
952 return llvm::none_of(Types, [](Type *Ty) {
953 if (Ty->isSized())
954 return Ty->isIntOrIntVectorTy(1) && Ty->getPrimitiveSizeInBits() > 128;
955 return false;
956 });
957}
958
960 LoopInfo *LI, DominatorTree *DT,
961 AssumptionCache *AC,
962 TargetLibraryInfo *LibInfo) const {
963 // Process nested loops first.
964 for (Loop *I : *L)
965 if (canSaveCmp(I, BI, SE, LI, DT, AC, LibInfo))
966 return false; // Stop search.
967
968 HardwareLoopInfo HWLoopInfo(L);
969
970 if (!HWLoopInfo.canAnalyze(*LI))
971 return false;
972
973 if (!isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo))
974 return false;
975
976 if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT))
977 return false;
978
979 *BI = HWLoopInfo.ExitBranch;
980 return true;
981}
982
984 const TargetTransformInfo::LSRCost &C2) const {
985 // PowerPC default behaviour here is "instruction number 1st priority".
986 // If LsrNoInsnsCost is set, call default implementation.
987 if (!LsrNoInsnsCost)
988 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
989 C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
990 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
991 C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
993}
994
995bool PPCTTIImpl::isNumRegsMajorCostOfLSR() const { return false; }
996
998 const PPCTargetMachine &TM = ST->getTargetMachine();
999 // XCOFF hasn't implemented lowerRelativeReference, disable non-ELF for now.
1000 if (!TM.isELFv2ABI())
1001 return false;
1003}
1004
1006 MemIntrinsicInfo &Info) const {
1007 switch (Inst->getIntrinsicID()) {
1008 case Intrinsic::ppc_altivec_lvx:
1009 case Intrinsic::ppc_altivec_lvxl:
1010 case Intrinsic::ppc_altivec_lvebx:
1011 case Intrinsic::ppc_altivec_lvehx:
1012 case Intrinsic::ppc_altivec_lvewx:
1013 case Intrinsic::ppc_vsx_lxvd2x:
1014 case Intrinsic::ppc_vsx_lxvw4x:
1015 case Intrinsic::ppc_vsx_lxvd2x_be:
1016 case Intrinsic::ppc_vsx_lxvw4x_be:
1017 case Intrinsic::ppc_vsx_lxvl:
1018 case Intrinsic::ppc_vsx_lxvll:
1019 case Intrinsic::ppc_vsx_lxvp: {
1020 Info.PtrVal = Inst->getArgOperand(0);
1021 Info.ReadMem = true;
1022 Info.WriteMem = false;
1023 return true;
1024 }
1025 case Intrinsic::ppc_altivec_stvx:
1026 case Intrinsic::ppc_altivec_stvxl:
1027 case Intrinsic::ppc_altivec_stvebx:
1028 case Intrinsic::ppc_altivec_stvehx:
1029 case Intrinsic::ppc_altivec_stvewx:
1030 case Intrinsic::ppc_vsx_stxvd2x:
1031 case Intrinsic::ppc_vsx_stxvw4x:
1032 case Intrinsic::ppc_vsx_stxvd2x_be:
1033 case Intrinsic::ppc_vsx_stxvw4x_be:
1034 case Intrinsic::ppc_vsx_stxvl:
1035 case Intrinsic::ppc_vsx_stxvll:
1036 case Intrinsic::ppc_vsx_stxvp: {
1037 Info.PtrVal = Inst->getArgOperand(1);
1038 Info.ReadMem = false;
1039 Info.WriteMem = true;
1040 return true;
1041 }
1042 case Intrinsic::ppc_stbcx:
1043 case Intrinsic::ppc_sthcx:
1044 case Intrinsic::ppc_stdcx:
1045 case Intrinsic::ppc_stwcx: {
1046 Info.PtrVal = Inst->getArgOperand(0);
1047 Info.ReadMem = false;
1048 Info.WriteMem = true;
1049 return true;
1050 }
1051 default:
1052 break;
1053 }
1054
1055 return false;
1056}
1057
1059 return TLI->supportsTailCallFor(CB);
1060}
1061
1062// Target hook used by CodeGen to decide whether to expand vector predication
1063// intrinsics into scalar operations or to use special ISD nodes to represent
1064// them. The Target will not see the intrinsics.
1068 unsigned Directive = ST->getCPUDirective();
1069 VPLegalization DefaultLegalization = BaseT::getVPLegalizationStrategy(PI);
1072 return DefaultLegalization;
1073
1074 if (!ST->isPPC64())
1075 return DefaultLegalization;
1076
1077 unsigned IID = PI.getIntrinsicID();
1078 if (IID != Intrinsic::vp_load && IID != Intrinsic::vp_store)
1079 return DefaultLegalization;
1080
1081 bool IsLoad = IID == Intrinsic::vp_load;
1082 Type *VecTy = IsLoad ? PI.getType() : PI.getOperand(0)->getType();
1083 EVT VT = TLI->getValueType(DL, VecTy, true);
1084 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
1085 VT != MVT::v16i8)
1086 return DefaultLegalization;
1087
1088 auto IsAllTrueMask = [](Value *MaskVal) {
1089 if (Value *SplattedVal = getSplatValue(MaskVal))
1090 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1091 return ConstValue->isAllOnesValue();
1092 return false;
1093 };
1094 unsigned MaskIx = IsLoad ? 1 : 2;
1095 if (!IsAllTrueMask(PI.getOperand(MaskIx)))
1096 return DefaultLegalization;
1097
1099}
1100
1102 if (!PPCEVL || !ST->isPPC64())
1103 return false;
1104 unsigned CPU = ST->getCPUDirective();
1105 return CPU == PPC::DIR_PWR10 || CPU == PPC::DIR_PWR_FUTURE ||
1106 (Pwr9EVL && CPU == PPC::DIR_PWR9);
1107}
1108
1109bool PPCTTIImpl::isLegalMaskedLoad(Type *DataType, Align Alignment,
1110 unsigned AddressSpace,
1111 TTI::MaskKind MaskKind) const {
1112 if (!hasActiveVectorLength())
1113 return false;
1114
1115 auto IsLegalLoadWithLengthType = [](EVT VT) {
1116 if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8)
1117 return false;
1118 return true;
1119 };
1120
1121 return IsLegalLoadWithLengthType(TLI->getValueType(DL, DataType, true));
1122}
1123
1125 unsigned AddressSpace,
1126 TTI::MaskKind MaskKind) const {
1127 return isLegalMaskedLoad(DataType, Alignment, AddressSpace);
1128}
1129
1133
1135
1136 unsigned Opcode;
1137 switch (MICA.getID()) {
1138 case Intrinsic::masked_load:
1139 Opcode = Instruction::Load;
1140 break;
1141 case Intrinsic::masked_store:
1142 Opcode = Instruction::Store;
1143 break;
1144 default:
1145 return BaseCost;
1146 }
1147
1148 Type *DataTy = MICA.getDataType();
1149 Align Alignment = MICA.getAlignment();
1150 unsigned AddressSpace = MICA.getAddressSpace();
1151
1152 auto VecTy = dyn_cast<FixedVectorType>(DataTy);
1153 if (!VecTy)
1154 return BaseCost;
1155 if (Opcode == Instruction::Load) {
1156 if (!isLegalMaskedLoad(VecTy->getScalarType(), Alignment, AddressSpace))
1157 return BaseCost;
1158 } else {
1159 if (!isLegalMaskedStore(VecTy->getScalarType(), Alignment, AddressSpace))
1160 return BaseCost;
1161 }
1162 if (VecTy->getPrimitiveSizeInBits() > 128)
1163 return BaseCost;
1164
1165 // Cost is 1 (scalar compare) + 1 (scalar select) +
1166 // 1 * vectorCostAdjustmentFactor (vector load with length)
1167 // Maybe + 1 (scalar shift)
1169 1 + 1 + vectorCostAdjustmentFactor(Opcode, DataTy, nullptr);
1170 if (ST->getCPUDirective() != PPC::DIR_PWR_FUTURE ||
1171 VecTy->getScalarSizeInBits() != 8)
1172 Cost += 1; // need shift for length
1173 return Cost;
1174}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
TargetTransformInfo::VPLegalization VPLegalization
This file provides the interface for the instcombine pass implementation.
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Machine Trace Metrics
uint64_t IntrinsicInst * II
static cl::opt< bool > PPCEVL("ppc-evl", cl::desc("Allow EVL type vp.load/vp.store"), cl::init(false), cl::Hidden)
static cl::opt< bool > VecMaskCost("ppc-vec-mask-cost", cl::desc("add masking cost for i1 vectors"), cl::init(true), cl::Hidden)
static cl::opt< bool > Pwr9EVL("ppc-pwr9-evl", cl::desc("Allow vp.load and vp.store for pwr9"), cl::init(false), cl::Hidden)
static cl::opt< bool > DisablePPCConstHoist("disable-ppc-constant-hoisting", cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden)
static cl::opt< unsigned > SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden, cl::desc("Loops with a constant trip count smaller than " "this value will not use the count register."))
static bool isMMAType(Type *Ty)
static cl::opt< bool > EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false), cl::desc("Enable using coldcc calling conv for cold " "internal functions"))
static cl::opt< bool > LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false), cl::desc("Do not add instruction count to lsr cost model"))
This file a TargetTransformInfoImplBase conforming object specific to the PPC target machine.
This file contains the declarations for profiling metadata utility functions.
static unsigned getNumElements(Type *Ty)
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition APInt.h:78
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool shouldBuildRelLookupTables() const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Value * getArgOperand(unsigned i) const
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
Conditional Branch instruction.
This is an important base class in LLVM.
Definition Constant.h:43
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
Container class for subtarget features.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2584
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2572
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:522
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2199
The core instruction combiner logic.
const DataLayout & getDataLayout() const
DominatorTree & getDominatorTree() const
BuilderTy & Builder
AssumptionCache & getAssumptionCache() const
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
const FeatureBitset & getFeatureBits() const
Information for memory intrinsic cost model.
The optimization diagnostic interface.
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost vectorCostAdjustmentFactor(unsigned Opcode, Type *Ty1, Type *Ty2) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isLegalMaskedLoad(Type *DataType, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool enableInterleavedAccessVectorization() const override
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
unsigned getCacheLineSize() const override
bool hasActiveVectorLength() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Get memory intrinsic cost based on arguments.
bool useColdCCForColdCall(Function &F) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalMaskedStore(Type *DataType, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
bool isNumRegsMajorCostOfLSR() const override
unsigned getPrefetchDistance() const override
TargetTransformInfo::VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
unsigned getNumberOfRegisters(unsigned ClassID) const override
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
const char * getRegisterClassName(unsigned ClassID) const override
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
bool shouldBuildRelLookupTables() const override
bool supportsTailCallFor(const CallBase *CB) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
bool canSaveCmp(Loop *L, CondBrInst **BI, ScalarEvolution *SE, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *LibInfo) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool enableAggressiveInterleaving(bool LoopHasReductions) const override
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
Common code between 32-bit and 64-bit PowerPC targets.
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
The main scalar evolution driver.
LLVM_ABI unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
Provide an instruction scheduling machine model to CodeGen passes.
unsigned getIssueWidth() const
Maximum number of micro-ops that may be scheduled per cycle.
LLVM_ABI void init(const TargetSubtargetInfo *TSInfo, bool EnableSModel=true, bool EnableSItins=true)
Initialize the machine model for instruction scheduling.
virtual TargetTransformInfo::VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const
virtual InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const
virtual InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const
virtual InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
virtual InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
MaskKind
Some targets only support masked load/store with a constant mask.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
CastContextHint
Represents a hint about the context in which a cast is used.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
Value * getOperand(unsigned i) const
Definition User.h:207
This is the common base class for vector predication intrinsics.
static LLVM_ABI bool isVPIntrinsic(Intrinsic::ID)
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
Base class of all SIMD vector types.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit ver...
Definition MathExtras.h:267
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI Align getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to ensure that the alignment of V is at least PrefAlign bytes.
Definition Local.cpp:1581
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI bool extractBranchWeights(const MDNode *ProfileData, SmallVectorImpl< uint32_t > &Weights)
Extract branch weights from MD_prof metadata.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Utility to calculate the size and a few similar metrics for a set of basic blocks.
Definition CodeMetrics.h:34
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Extended Value Type.
Definition ValueTypes.h:35
Attributes of a target dependent hardware loop.
LLVM_ABI bool canAnalyze(LoopInfo &LI)
LLVM_ABI bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
Information about a load/store intrinsic defined by the target.
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is.
Parameters that control the generic loop unrolling transformation.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...