| File: | build/source/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp |
| Warning: | line 2825, column 21: The result of the left shift is undefined due to shifting by '4294967295', which is greater or equal to the width of type 'int' |
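For reference, this diagnostic fires when a shift amount can reach 4294967295 (~0U), typically a sentinel returned by a bit-scan or lookup helper; shifting a 32-bit int by an amount greater than or equal to its width is undefined behaviour in C++. The snippet below is a minimal, hypothetical sketch of that pattern and one way to guard it; it is not the code at line 2825, which lies outside this excerpt, and the function name is invented for illustration.

#include <limits>

// Hypothetical illustration of the flagged pattern, not the code at line 2825.
unsigned bitForIndex(unsigned Idx) {
  // If Idx arrives as ~0U (a common "not found" sentinel), 1u << Idx shifts by
  // 4294967295, which is >= the 32-bit width of the operand and is therefore
  // undefined behaviour. Rejecting out-of-range amounts keeps the shift defined.
  if (Idx >= std::numeric_limits<unsigned>::digits)
    return 0;
  return 1u << Idx;
}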
| 1 | //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// | |||
| 2 | // | |||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
| 4 | // See https://llvm.org/LICENSE.txt for license information. | |||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
| 6 | // | |||
| 7 | //===----------------------------------------------------------------------===// | |||
| 8 | ||||
| 9 | #include "AArch64TargetTransformInfo.h" | |||
| 10 | #include "AArch64ExpandImm.h" | |||
| 11 | #include "AArch64PerfectShuffle.h" | |||
| 12 | #include "MCTargetDesc/AArch64AddressingModes.h" | |||
| 13 | #include "llvm/Analysis/IVDescriptors.h" | |||
| 14 | #include "llvm/Analysis/LoopInfo.h" | |||
| 15 | #include "llvm/Analysis/TargetTransformInfo.h" | |||
| 16 | #include "llvm/CodeGen/BasicTTIImpl.h" | |||
| 17 | #include "llvm/CodeGen/CostTable.h" | |||
| 18 | #include "llvm/CodeGen/TargetLowering.h" | |||
| 19 | #include "llvm/IR/IntrinsicInst.h" | |||
| 20 | #include "llvm/IR/Intrinsics.h" | |||
| 21 | #include "llvm/IR/IntrinsicsAArch64.h" | |||
| 22 | #include "llvm/IR/PatternMatch.h" | |||
| 23 | #include "llvm/Support/Debug.h" | |||
| 24 | #include "llvm/Transforms/InstCombine/InstCombiner.h" | |||
| 25 | #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" | |||
| 26 | #include <algorithm> | |||
| 27 | #include <optional> | |||
| 28 | using namespace llvm; | |||
| 29 | using namespace llvm::PatternMatch; | |||
| 30 | ||||
| 31 | #define DEBUG_TYPE "aarch64tti" | |||
| 32 | ||||
| 33 | static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", | |||
| 34 | cl::init(true), cl::Hidden); | |||
| 35 | ||||
| 36 | static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10), | |||
| 37 | cl::Hidden); | |||
| 38 | ||||
| 39 | static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", | |||
| 40 | cl::init(10), cl::Hidden); | |||
| 41 | ||||
| 42 | namespace { | |||
| 43 | class TailFoldingKind { | |||
| 44 | private: | |||
| 45 | uint8_t Bits = 0; // Currently defaults to disabled. | |||
| 46 | ||||
| 47 | public: | |||
| 48 | enum TailFoldingOpts { | |||
| 49 | TFDisabled = 0x0, | |||
| 50 | TFReductions = 0x01, | |||
| 51 | TFRecurrences = 0x02, | |||
| 52 | TFReverse = 0x04, | |||
| 53 | TFSimple = 0x80, | |||
| 54 | TFAll = TFReductions | TFRecurrences | TFReverse | TFSimple | |||
| 55 | }; | |||
| 56 | ||||
| 57 | void operator=(const std::string &Val) { | |||
| 58 | if (Val.empty()) | |||
| 59 | return; | |||
| 60 | SmallVector<StringRef, 6> TailFoldTypes; | |||
| 61 | StringRef(Val).split(TailFoldTypes, '+', -1, false); | |||
| 62 | for (auto TailFoldType : TailFoldTypes) { | |||
| 63 | if (TailFoldType == "disabled") | |||
| 64 | Bits = 0; | |||
| 65 | else if (TailFoldType == "all") | |||
| 66 | Bits = TFAll; | |||
| 67 | else if (TailFoldType == "default") | |||
| 68 | Bits = 0; // Currently defaults to never tail-folding. | |||
| 69 | else if (TailFoldType == "simple") | |||
| 70 | add(TFSimple); | |||
| 71 | else if (TailFoldType == "reductions") | |||
| 72 | add(TFReductions); | |||
| 73 | else if (TailFoldType == "recurrences") | |||
| 74 | add(TFRecurrences); | |||
| 75 | else if (TailFoldType == "reverse") | |||
| 76 | add(TFReverse); | |||
| 77 | else if (TailFoldType == "noreductions") | |||
| 78 | remove(TFReductions); | |||
| 79 | else if (TailFoldType == "norecurrences") | |||
| 80 | remove(TFRecurrences); | |||
| 81 | else if (TailFoldType == "noreverse") | |||
| 82 | remove(TFReverse); | |||
| 83 | else { | |||
| 84 | errs() | |||
| 85 | << "invalid argument " << TailFoldType.str() | |||
| 86 | << " to -sve-tail-folding=; each element must be one of: disabled, " | |||
| 87 | "all, default, simple, reductions, noreductions, recurrences, " | |||
| 88 | "norecurrences\n"; | |||
| 89 | } | |||
| 90 | } | |||
| 91 | } | |||
| 92 | ||||
| 93 | operator uint8_t() const { return Bits; } | |||
| 94 | ||||
| 95 | void add(uint8_t Flag) { Bits |= Flag; } | |||
| 96 | void remove(uint8_t Flag) { Bits &= ~Flag; } | |||
| 97 | }; | |||
| 98 | } // namespace | |||
| 99 | ||||
| 100 | TailFoldingKind TailFoldingKindLoc; | |||
| 101 | ||||
| 102 | cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding( | |||
| 103 | "sve-tail-folding", | |||
| 104 | cl::desc( | |||
| 105 | "Control the use of vectorisation using tail-folding for SVE:" | |||
| 106 | "\ndisabled No loop types will vectorize using tail-folding" | |||
| 107 | "\ndefault Uses the default tail-folding settings for the target " | |||
| 108 | "CPU" | |||
| 109 | "\nall All legal loop types will vectorize using tail-folding" | |||
| 110 | "\nsimple Use tail-folding for simple loops (not reductions or " | |||
| 111 | "recurrences)" | |||
| 112 | "\nreductions Use tail-folding for loops containing reductions" | |||
| 113 | "\nrecurrences Use tail-folding for loops containing fixed order " | |||
| 114 | "recurrences" | |||
| 115 | "\nreverse Use tail-folding for loops requiring reversed " | |||
| 116 | "predicates"), | |||
| 117 | cl::location(TailFoldingKindLoc)); | |||
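// A usage sketch for the option above (assuming the standard way LLVM cl::opt
// flags are reached, e.g. "-mllvm -sve-tail-folding=..." from clang or the
// bare flag on opt/llc; the exact invocation is an assumption, not taken from
// this file):
//   -sve-tail-folding=all+noreductions
// TailFoldingKind::operator= splits the string on '+' and applies entries left
// to right, so the trailing "noreductions" clears the reductions bit that
// "all" set, leaving simple, recurrence and reverse-predicated loops eligible.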
| 118 | ||||
| 119 | // Experimental option that will only be fully functional when the | |||
| 120 | // code-generator is changed to use SVE instead of NEON for all fixed-width | |||
| 121 | // operations. | |||
| 122 | static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode( | |||
| 123 | "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden); | |||
| 124 | ||||
| 125 | // Experimental option that will only be fully functional when the cost-model | |||
| 126 | // and code-generator have been changed to avoid using scalable vector | |||
| 127 | // instructions that are not legal in streaming SVE mode. | |||
| 128 | static cl::opt<bool> EnableScalableAutovecInStreamingMode( | |||
| 129 | "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden); | |||
| 130 | ||||
| 131 | bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, | |||
| 132 | const Function *Callee) const { | |||
| 133 | SMEAttrs CallerAttrs(*Caller); | |||
| 134 | SMEAttrs CalleeAttrs(*Callee); | |||
| 135 | if (CallerAttrs.requiresSMChange(CalleeAttrs, | |||
| 136 | /*BodyOverridesInterface=*/true) || | |||
| 137 | CallerAttrs.requiresLazySave(CalleeAttrs) || | |||
| 138 | CalleeAttrs.hasNewZAInterface()) | |||
| 139 | return false; | |||
| 140 | ||||
| 141 | const TargetMachine &TM = getTLI()->getTargetMachine(); | |||
| 142 | ||||
| 143 | const FeatureBitset &CallerBits = | |||
| 144 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); | |||
| 145 | const FeatureBitset &CalleeBits = | |||
| 146 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); | |||
| 147 | ||||
| 148 | // Inline a callee if its target-features are a subset of the callers | |||
| 149 | // target-features. | |||
| 150 | return (CallerBits & CalleeBits) == CalleeBits; | |||
| 151 | } | |||
| 152 | ||||
| 153 | bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( | |||
| 154 | TargetTransformInfo::RegisterKind K) const { | |||
| 155 | assert(K != TargetTransformInfo::RGK_Scalar); | |||
| 156 | return K == TargetTransformInfo::RGK_FixedWidthVector; | |||
| 157 | } | |||
| 158 | ||||
| 159 | /// Calculate the cost of materializing a 64-bit value. This helper | |||
| 160 | /// method might only calculate a fraction of a larger immediate. Therefore it | |||
| 161 | /// is valid to return a cost of ZERO. | |||
| 162 | InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) { | |||
| 163 | // Check if the immediate can be encoded within an instruction. | |||
| 164 | if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) | |||
| 165 | return 0; | |||
| 166 | ||||
| 167 | if (Val < 0) | |||
| 168 | Val = ~Val; | |||
| 169 | ||||
| 170 | // Calculate how many moves we will need to materialize this constant. | |||
| 171 | SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; | |||
| 172 | AArch64_IMM::expandMOVImm(Val, 64, Insn); | |||
| 173 | return Insn.size(); | |||
| 174 | } | |||
| 175 | ||||
| 176 | /// Calculate the cost of materializing the given constant. | |||
| 177 | InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, | |||
| 178 | TTI::TargetCostKind CostKind) { | |||
| 179 | assert(Ty->isIntegerTy()); | |||
| 180 | ||||
| 181 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
| 182 | if (BitSize == 0) | |||
| 183 | return ~0U; | |||
| 184 | ||||
| 185 | // Sign-extend all constants to a multiple of 64-bit. | |||
| 186 | APInt ImmVal = Imm; | |||
| 187 | if (BitSize & 0x3f) | |||
| 188 | ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); | |||
| 189 | ||||
| 190 | // Split the constant into 64-bit chunks and calculate the cost for each | |||
| 191 | // chunk. | |||
| 192 | InstructionCost Cost = 0; | |||
| 193 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { | |||
| 194 | APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); | |||
| 195 | int64_t Val = Tmp.getSExtValue(); | |||
| 196 | Cost += getIntImmCost(Val); | |||
| 197 | } | |||
| 198 | // We need at least one instruction to materialze the constant. | |||
| 199 | return std::max<InstructionCost>(1, Cost); | |||
| 200 | } | |||
| 201 | ||||
| 202 | InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, | |||
| 203 | const APInt &Imm, Type *Ty, | |||
| 204 | TTI::TargetCostKind CostKind, | |||
| 205 | Instruction *Inst) { | |||
| 206 | assert(Ty->isIntegerTy()); | |||
| 207 | ||||
| 208 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
| 209 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | |||
| 210 | // here, so that constant hoisting will ignore this constant. | |||
| 211 | if (BitSize == 0) | |||
| 212 | return TTI::TCC_Free; | |||
| 213 | ||||
| 214 | unsigned ImmIdx = ~0U; | |||
| 215 | switch (Opcode) { | |||
| 216 | default: | |||
| 217 | return TTI::TCC_Free; | |||
| 218 | case Instruction::GetElementPtr: | |||
| 219 | // Always hoist the base address of a GetElementPtr. | |||
| 220 | if (Idx == 0) | |||
| 221 | return 2 * TTI::TCC_Basic; | |||
| 222 | return TTI::TCC_Free; | |||
| 223 | case Instruction::Store: | |||
| 224 | ImmIdx = 0; | |||
| 225 | break; | |||
| 226 | case Instruction::Add: | |||
| 227 | case Instruction::Sub: | |||
| 228 | case Instruction::Mul: | |||
| 229 | case Instruction::UDiv: | |||
| 230 | case Instruction::SDiv: | |||
| 231 | case Instruction::URem: | |||
| 232 | case Instruction::SRem: | |||
| 233 | case Instruction::And: | |||
| 234 | case Instruction::Or: | |||
| 235 | case Instruction::Xor: | |||
| 236 | case Instruction::ICmp: | |||
| 237 | ImmIdx = 1; | |||
| 238 | break; | |||
| 239 | // Always return TCC_Free for the shift value of a shift instruction. | |||
| 240 | case Instruction::Shl: | |||
| 241 | case Instruction::LShr: | |||
| 242 | case Instruction::AShr: | |||
| 243 | if (Idx == 1) | |||
| 244 | return TTI::TCC_Free; | |||
| 245 | break; | |||
| 246 | case Instruction::Trunc: | |||
| 247 | case Instruction::ZExt: | |||
| 248 | case Instruction::SExt: | |||
| 249 | case Instruction::IntToPtr: | |||
| 250 | case Instruction::PtrToInt: | |||
| 251 | case Instruction::BitCast: | |||
| 252 | case Instruction::PHI: | |||
| 253 | case Instruction::Call: | |||
| 254 | case Instruction::Select: | |||
| 255 | case Instruction::Ret: | |||
| 256 | case Instruction::Load: | |||
| 257 | break; | |||
| 258 | } | |||
| 259 | ||||
| 260 | if (Idx == ImmIdx) { | |||
| 261 | int NumConstants = (BitSize + 63) / 64; | |||
| 262 | InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
| 263 | return (Cost <= NumConstants * TTI::TCC_Basic) | |||
| 264 | ? static_cast<int>(TTI::TCC_Free) | |||
| 265 | : Cost; | |||
| 266 | } | |||
| 267 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
| 268 | } | |||
| 269 | ||||
| 270 | InstructionCost | |||
| 271 | AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, | |||
| 272 | const APInt &Imm, Type *Ty, | |||
| 273 | TTI::TargetCostKind CostKind) { | |||
| 274 | assert(Ty->isIntegerTy()); | |||
| 275 | ||||
| 276 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
| 277 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | |||
| 278 | // here, so that constant hoisting will ignore this constant. | |||
| 279 | if (BitSize == 0) | |||
| 280 | return TTI::TCC_Free; | |||
| 281 | ||||
| 282 | // Most (all?) AArch64 intrinsics do not support folding immediates into the | |||
| 283 | // selected instruction, so we compute the materialization cost for the | |||
| 284 | // immediate directly. | |||
| 285 | if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) | |||
| 286 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
| 287 | ||||
| 288 | switch (IID) { | |||
| 289 | default: | |||
| 290 | return TTI::TCC_Free; | |||
| 291 | case Intrinsic::sadd_with_overflow: | |||
| 292 | case Intrinsic::uadd_with_overflow: | |||
| 293 | case Intrinsic::ssub_with_overflow: | |||
| 294 | case Intrinsic::usub_with_overflow: | |||
| 295 | case Intrinsic::smul_with_overflow: | |||
| 296 | case Intrinsic::umul_with_overflow: | |||
| 297 | if (Idx == 1) { | |||
| 298 | int NumConstants = (BitSize + 63) / 64; | |||
| 299 | InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
| 300 | return (Cost <= NumConstants * TTI::TCC_Basic) | |||
| 301 | ? static_cast<int>(TTI::TCC_Free) | |||
| 302 | : Cost; | |||
| 303 | } | |||
| 304 | break; | |||
| 305 | case Intrinsic::experimental_stackmap: | |||
| 306 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | |||
| 307 | return TTI::TCC_Free; | |||
| 308 | break; | |||
| 309 | case Intrinsic::experimental_patchpoint_void: | |||
| 310 | case Intrinsic::experimental_patchpoint_i64: | |||
| 311 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | |||
| 312 | return TTI::TCC_Free; | |||
| 313 | break; | |||
| 314 | case Intrinsic::experimental_gc_statepoint: | |||
| 315 | if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) | |||
| 316 | return TTI::TCC_Free; | |||
| 317 | break; | |||
| 318 | } | |||
| 319 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
| 320 | } | |||
| 321 | ||||
| 322 | TargetTransformInfo::PopcntSupportKind | |||
| 323 | AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { | |||
| 324 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); | |||
| 325 | if (TyWidth == 32 || TyWidth == 64) | |||
| 326 | return TTI::PSK_FastHardware; | |||
| 327 | // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount. | |||
| 328 | return TTI::PSK_Software; | |||
| 329 | } | |||
| 330 | ||||
| 331 | InstructionCost | |||
| 332 | AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | |||
| 333 | TTI::TargetCostKind CostKind) { | |||
| 334 | auto *RetTy = ICA.getReturnType(); | |||
| 335 | switch (ICA.getID()) { | |||
| 336 | case Intrinsic::umin: | |||
| 337 | case Intrinsic::umax: | |||
| 338 | case Intrinsic::smin: | |||
| 339 | case Intrinsic::smax: { | |||
| 340 | static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, | |||
| 341 | MVT::v8i16, MVT::v2i32, MVT::v4i32}; | |||
| 342 | auto LT = getTypeLegalizationCost(RetTy); | |||
| 343 | // v2i64 types get converted to cmp+bif hence the cost of 2 | |||
| 344 | if (LT.second == MVT::v2i64) | |||
| 345 | return LT.first * 2; | |||
| 346 | if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; })) | |||
| 347 | return LT.first; | |||
| 348 | break; | |||
| 349 | } | |||
| 350 | case Intrinsic::sadd_sat: | |||
| 351 | case Intrinsic::ssub_sat: | |||
| 352 | case Intrinsic::uadd_sat: | |||
| 353 | case Intrinsic::usub_sat: { | |||
| 354 | static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, | |||
| 355 | MVT::v8i16, MVT::v2i32, MVT::v4i32, | |||
| 356 | MVT::v2i64}; | |||
| 357 | auto LT = getTypeLegalizationCost(RetTy); | |||
| 358 | // This is a base cost of 1 for the vadd, plus 3 extract shifts if we | |||
| 359 | // need to extend the type, as it uses shr(qadd(shl, shl)). | |||
| 360 | unsigned Instrs = | |||
| 361 | LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4; | |||
| 362 | if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; })) | |||
| 363 | return LT.first * Instrs; | |||
| 364 | break; | |||
| 365 | } | |||
| 366 | case Intrinsic::abs: { | |||
| 367 | static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, | |||
| 368 | MVT::v8i16, MVT::v2i32, MVT::v4i32, | |||
| 369 | MVT::v2i64}; | |||
| 370 | auto LT = getTypeLegalizationCost(RetTy); | |||
| 371 | if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; })) | |||
| 372 | return LT.first; | |||
| 373 | break; | |||
| 374 | } | |||
| 375 | case Intrinsic::experimental_stepvector: { | |||
| 376 | InstructionCost Cost = 1; // Cost of the `index' instruction | |||
| 377 | auto LT = getTypeLegalizationCost(RetTy); | |||
| 378 | // Legalisation of illegal vectors involves an `index' instruction plus | |||
| 379 | // (LT.first - 1) vector adds. | |||
| 380 | if (LT.first > 1) { | |||
| 381 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext()); | |||
| 382 | InstructionCost AddCost = | |||
| 383 | getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind); | |||
| 384 | Cost += AddCost * (LT.first - 1); | |||
| 385 | } | |||
| 386 | return Cost; | |||
| 387 | } | |||
| 388 | case Intrinsic::bitreverse: { | |||
| 389 | static const CostTblEntry BitreverseTbl[] = { | |||
| 390 | {Intrinsic::bitreverse, MVT::i32, 1}, | |||
| 391 | {Intrinsic::bitreverse, MVT::i64, 1}, | |||
| 392 | {Intrinsic::bitreverse, MVT::v8i8, 1}, | |||
| 393 | {Intrinsic::bitreverse, MVT::v16i8, 1}, | |||
| 394 | {Intrinsic::bitreverse, MVT::v4i16, 2}, | |||
| 395 | {Intrinsic::bitreverse, MVT::v8i16, 2}, | |||
| 396 | {Intrinsic::bitreverse, MVT::v2i32, 2}, | |||
| 397 | {Intrinsic::bitreverse, MVT::v4i32, 2}, | |||
| 398 | {Intrinsic::bitreverse, MVT::v1i64, 2}, | |||
| 399 | {Intrinsic::bitreverse, MVT::v2i64, 2}, | |||
| 400 | }; | |||
| 401 | const auto LegalisationCost = getTypeLegalizationCost(RetTy); | |||
| 402 | const auto *Entry = | |||
| 403 | CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second); | |||
| 404 | if (Entry) { | |||
| 405 | // Cost Model is using the legal type(i32) that i8 and i16 will be | |||
| 406 | // converted to +1 so that we match the actual lowering cost | |||
| 407 | if (TLI->getValueType(DL, RetTy, true) == MVT::i8 || | |||
| 408 | TLI->getValueType(DL, RetTy, true) == MVT::i16) | |||
| 409 | return LegalisationCost.first * Entry->Cost + 1; | |||
| 410 | ||||
| 411 | return LegalisationCost.first * Entry->Cost; | |||
| 412 | } | |||
| 413 | break; | |||
| 414 | } | |||
| 415 | case Intrinsic::ctpop: { | |||
| 416 | if (!ST->hasNEON()) { | |||
| 417 | // 32-bit or 64-bit ctpop without NEON is 12 instructions. | |||
| 418 | return getTypeLegalizationCost(RetTy).first * 12; | |||
| 419 | } | |||
| 420 | static const CostTblEntry CtpopCostTbl[] = { | |||
| 421 | {ISD::CTPOP, MVT::v2i64, 4}, | |||
| 422 | {ISD::CTPOP, MVT::v4i32, 3}, | |||
| 423 | {ISD::CTPOP, MVT::v8i16, 2}, | |||
| 424 | {ISD::CTPOP, MVT::v16i8, 1}, | |||
| 425 | {ISD::CTPOP, MVT::i64, 4}, | |||
| 426 | {ISD::CTPOP, MVT::v2i32, 3}, | |||
| 427 | {ISD::CTPOP, MVT::v4i16, 2}, | |||
| 428 | {ISD::CTPOP, MVT::v8i8, 1}, | |||
| 429 | {ISD::CTPOP, MVT::i32, 5}, | |||
| 430 | }; | |||
| 431 | auto LT = getTypeLegalizationCost(RetTy); | |||
| 432 | MVT MTy = LT.second; | |||
| 433 | if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) { | |||
| 434 | // Extra cost of +1 when illegal vector types are legalized by promoting | |||
| 435 | // the integer type. | |||
| 436 | int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() != | |||
| 437 | RetTy->getScalarSizeInBits() | |||
| 438 | ? 1 | |||
| 439 | : 0; | |||
| 440 | return LT.first * Entry->Cost + ExtraCost; | |||
| 441 | } | |||
| 442 | break; | |||
| 443 | } | |||
| 444 | case Intrinsic::sadd_with_overflow: | |||
| 445 | case Intrinsic::uadd_with_overflow: | |||
| 446 | case Intrinsic::ssub_with_overflow: | |||
| 447 | case Intrinsic::usub_with_overflow: | |||
| 448 | case Intrinsic::smul_with_overflow: | |||
| 449 | case Intrinsic::umul_with_overflow: { | |||
| 450 | static const CostTblEntry WithOverflowCostTbl[] = { | |||
| 451 | {Intrinsic::sadd_with_overflow, MVT::i8, 3}, | |||
| 452 | {Intrinsic::uadd_with_overflow, MVT::i8, 3}, | |||
| 453 | {Intrinsic::sadd_with_overflow, MVT::i16, 3}, | |||
| 454 | {Intrinsic::uadd_with_overflow, MVT::i16, 3}, | |||
| 455 | {Intrinsic::sadd_with_overflow, MVT::i32, 1}, | |||
| 456 | {Intrinsic::uadd_with_overflow, MVT::i32, 1}, | |||
| 457 | {Intrinsic::sadd_with_overflow, MVT::i64, 1}, | |||
| 458 | {Intrinsic::uadd_with_overflow, MVT::i64, 1}, | |||
| 459 | {Intrinsic::ssub_with_overflow, MVT::i8, 3}, | |||
| 460 | {Intrinsic::usub_with_overflow, MVT::i8, 3}, | |||
| 461 | {Intrinsic::ssub_with_overflow, MVT::i16, 3}, | |||
| 462 | {Intrinsic::usub_with_overflow, MVT::i16, 3}, | |||
| 463 | {Intrinsic::ssub_with_overflow, MVT::i32, 1}, | |||
| 464 | {Intrinsic::usub_with_overflow, MVT::i32, 1}, | |||
| 465 | {Intrinsic::ssub_with_overflow, MVT::i64, 1}, | |||
| 466 | {Intrinsic::usub_with_overflow, MVT::i64, 1}, | |||
| 467 | {Intrinsic::smul_with_overflow, MVT::i8, 5}, | |||
| 468 | {Intrinsic::umul_with_overflow, MVT::i8, 4}, | |||
| 469 | {Intrinsic::smul_with_overflow, MVT::i16, 5}, | |||
| 470 | {Intrinsic::umul_with_overflow, MVT::i16, 4}, | |||
| 471 | {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst | |||
| 472 | {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw | |||
| 473 | {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp | |||
| 474 | {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr | |||
| 475 | }; | |||
| 476 | EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true); | |||
| 477 | if (MTy.isSimple()) | |||
| 478 | if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(), | |||
| 479 | MTy.getSimpleVT())) | |||
| 480 | return Entry->Cost; | |||
| 481 | break; | |||
| 482 | } | |||
| 483 | case Intrinsic::fptosi_sat: | |||
| 484 | case Intrinsic::fptoui_sat: { | |||
| 485 | if (ICA.getArgTypes().empty()) | |||
| 486 | break; | |||
| 487 | bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; | |||
| 488 | auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]); | |||
| 489 | EVT MTy = TLI->getValueType(DL, RetTy); | |||
| 490 | // Check for the legal types, which are where the size of the input and the | |||
| 491 | // output are the same, or we are using cvt f64->i32 or f32->i64. | |||
| 492 | if ((LT.second == MVT::f32 || LT.second == MVT::f64 || | |||
| 493 | LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || | |||
| 494 | LT.second == MVT::v2f64) && | |||
| 495 | (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || | |||
| 496 | (LT.second == MVT::f64 && MTy == MVT::i32) || | |||
| 497 | (LT.second == MVT::f32 && MTy == MVT::i64))) | |||
| 498 | return LT.first; | |||
| 499 | // Similarly for fp16 sizes | |||
| 500 | if (ST->hasFullFP16() && | |||
| 501 | ((LT.second == MVT::f16 && MTy == MVT::i32) || | |||
| 502 | ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && | |||
| 503 | (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))) | |||
| 504 | return LT.first; | |||
| 505 | ||||
| 506 | // Otherwise we use a legal convert followed by a min+max | |||
| 507 | if ((LT.second.getScalarType() == MVT::f32 || | |||
| 508 | LT.second.getScalarType() == MVT::f64 || | |||
| 509 | (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) && | |||
| 510 | LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { | |||
| 511 | Type *LegalTy = | |||
| 512 | Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits()); | |||
| 513 | if (LT.second.isVector()) | |||
| 514 | LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount()); | |||
| 515 | InstructionCost Cost = 1; | |||
| 516 | IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin, | |||
| 517 | LegalTy, {LegalTy, LegalTy}); | |||
| 518 | Cost += getIntrinsicInstrCost(Attrs1, CostKind); | |||
| 519 | IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax, | |||
| 520 | LegalTy, {LegalTy, LegalTy}); | |||
| 521 | Cost += getIntrinsicInstrCost(Attrs2, CostKind); | |||
| 522 | return LT.first * Cost; | |||
| 523 | } | |||
| 524 | break; | |||
| 525 | } | |||
| 526 | case Intrinsic::fshl: | |||
| 527 | case Intrinsic::fshr: { | |||
| 528 | if (ICA.getArgs().empty()) | |||
| 529 | break; | |||
| 530 | ||||
| 531 | // TODO: Add handling for fshl where third argument is not a constant. | |||
| 532 | const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]); | |||
| 533 | if (!OpInfoZ.isConstant()) | |||
| 534 | break; | |||
| 535 | ||||
| 536 | const auto LegalisationCost = getTypeLegalizationCost(RetTy); | |||
| 537 | if (OpInfoZ.isUniform()) { | |||
| 538 | // FIXME: The costs could be lower if the codegen is better. | |||
| 539 | static const CostTblEntry FshlTbl[] = { | |||
| 540 | {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr | |||
| 541 | {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4}, | |||
| 542 | {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3}, | |||
| 543 | {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}}; | |||
| 544 | // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl | |||
| 545 | // to avoid having to duplicate the costs. | |||
| 546 | const auto *Entry = | |||
| 547 | CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second); | |||
| 548 | if (Entry) | |||
| 549 | return LegalisationCost.first * Entry->Cost; | |||
| 550 | } | |||
| 551 | ||||
| 552 | auto TyL = getTypeLegalizationCost(RetTy); | |||
| 553 | if (!RetTy->isIntegerTy()) | |||
| 554 | break; | |||
| 555 | ||||
| 556 | // Estimate cost manually, as types like i8 and i16 will get promoted to | |||
| 557 | // i32 and CostTableLookup will ignore the extra conversion cost. | |||
| 558 | bool HigherCost = (RetTy->getScalarSizeInBits() != 32 && | |||
| 559 | RetTy->getScalarSizeInBits() < 64) || | |||
| 560 | (RetTy->getScalarSizeInBits() % 64 != 0); | |||
| 561 | unsigned ExtraCost = HigherCost ? 1 : 0; | |||
| 562 | if (RetTy->getScalarSizeInBits() == 32 || | |||
| 563 | RetTy->getScalarSizeInBits() == 64) | |||
| 564 | ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single | |||
| 565 | // extr instruction. | |||
| 566 | else if (HigherCost) | |||
| 567 | ExtraCost = 1; | |||
| 568 | else | |||
| 569 | break; | |||
| 570 | return TyL.first + ExtraCost; | |||
| 571 | } | |||
| 572 | default: | |||
| 573 | break; | |||
| 574 | } | |||
| 575 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | |||
| 576 | } | |||
| 577 | ||||
| 578 | /// The function will remove redundant reinterprets casting in the presence | |||
| 579 | /// of the control flow | |||
| 580 | static std::optional<Instruction *> processPhiNode(InstCombiner &IC, | |||
| 581 | IntrinsicInst &II) { | |||
| 582 | SmallVector<Instruction *, 32> Worklist; | |||
| 583 | auto RequiredType = II.getType(); | |||
| 584 | ||||
| 585 | auto *PN = dyn_cast<PHINode>(II.getArgOperand(0)); | |||
| 586 | assert(PN && "Expected Phi Node!")(static_cast <bool> (PN && "Expected Phi Node!" ) ? void (0) : __assert_fail ("PN && \"Expected Phi Node!\"" , "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 586 , __extension__ __PRETTY_FUNCTION__)); | |||
| 587 | ||||
| 588 | // Don't create a new Phi unless we can remove the old one. | |||
| 589 | if (!PN->hasOneUse()) | |||
| 590 | return std::nullopt; | |||
| 591 | ||||
| 592 | for (Value *IncValPhi : PN->incoming_values()) { | |||
| 593 | auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi); | |||
| 594 | if (!Reinterpret || | |||
| 595 | Reinterpret->getIntrinsicID() != | |||
| 596 | Intrinsic::aarch64_sve_convert_to_svbool || | |||
| 597 | RequiredType != Reinterpret->getArgOperand(0)->getType()) | |||
| 598 | return std::nullopt; | |||
| 599 | } | |||
| 600 | ||||
| 601 | // Create the new Phi | |||
| 602 | LLVMContext &Ctx = PN->getContext(); | |||
| 603 | IRBuilder<> Builder(Ctx); | |||
| 604 | Builder.SetInsertPoint(PN); | |||
| 605 | PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues()); | |||
| 606 | Worklist.push_back(PN); | |||
| 607 | ||||
| 608 | for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { | |||
| 609 | auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I)); | |||
| 610 | NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I)); | |||
| 611 | Worklist.push_back(Reinterpret); | |||
| 612 | } | |||
| 613 | ||||
| 614 | // Cleanup Phi Node and reinterprets | |||
| 615 | return IC.replaceInstUsesWith(II, NPN); | |||
| 616 | } | |||
| 617 | ||||
| 618 | // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))) | |||
| 619 | // => (binop (pred) (from_svbool _) (from_svbool _)) | |||
| 620 | // | |||
| 621 | // The above transformation eliminates a `to_svbool` in the predicate | |||
| 622 | // operand of bitwise operation `binop` by narrowing the vector width of | |||
| 623 | // the operation. For example, it would convert a `<vscale x 16 x i1> | |||
| 624 | // and` into a `<vscale x 4 x i1> and`. This is profitable because | |||
| 625 | // to_svbool must zero the new lanes during widening, whereas | |||
| 626 | // from_svbool is free. | |||
| 627 | static std::optional<Instruction *> | |||
| 628 | tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) { | |||
| 629 | auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0)); | |||
| 630 | if (!BinOp) | |||
| 631 | return std::nullopt; | |||
| 632 | ||||
| 633 | auto IntrinsicID = BinOp->getIntrinsicID(); | |||
| 634 | switch (IntrinsicID) { | |||
| 635 | case Intrinsic::aarch64_sve_and_z: | |||
| 636 | case Intrinsic::aarch64_sve_bic_z: | |||
| 637 | case Intrinsic::aarch64_sve_eor_z: | |||
| 638 | case Intrinsic::aarch64_sve_nand_z: | |||
| 639 | case Intrinsic::aarch64_sve_nor_z: | |||
| 640 | case Intrinsic::aarch64_sve_orn_z: | |||
| 641 | case Intrinsic::aarch64_sve_orr_z: | |||
| 642 | break; | |||
| 643 | default: | |||
| 644 | return std::nullopt; | |||
| 645 | } | |||
| 646 | ||||
| 647 | auto BinOpPred = BinOp->getOperand(0); | |||
| 648 | auto BinOpOp1 = BinOp->getOperand(1); | |||
| 649 | auto BinOpOp2 = BinOp->getOperand(2); | |||
| 650 | ||||
| 651 | auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred); | |||
| 652 | if (!PredIntr || | |||
| 653 | PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) | |||
| 654 | return std::nullopt; | |||
| 655 | ||||
| 656 | auto PredOp = PredIntr->getOperand(0); | |||
| 657 | auto PredOpTy = cast<VectorType>(PredOp->getType()); | |||
| 658 | if (PredOpTy != II.getType()) | |||
| 659 | return std::nullopt; | |||
| 660 | ||||
| 661 | IRBuilder<> Builder(II.getContext()); | |||
| 662 | Builder.SetInsertPoint(&II); | |||
| 663 | ||||
| 664 | SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; | |||
| 665 | auto NarrowBinOpOp1 = Builder.CreateIntrinsic( | |||
| 666 | Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1}); | |||
| 667 | NarrowedBinOpArgs.push_back(NarrowBinOpOp1); | |||
| 668 | if (BinOpOp1 == BinOpOp2) | |||
| 669 | NarrowedBinOpArgs.push_back(NarrowBinOpOp1); | |||
| 670 | else | |||
| 671 | NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic( | |||
| 672 | Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2})); | |||
| 673 | ||||
| 674 | auto NarrowedBinOp = | |||
| 675 | Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs); | |||
| 676 | return IC.replaceInstUsesWith(II, NarrowedBinOp); | |||
| 677 | } | |||
| 678 | ||||
| 679 | static std::optional<Instruction *> | |||
| 680 | instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) { | |||
| 681 | // If the reinterpret instruction operand is a PHI Node | |||
| 682 | if (isa<PHINode>(II.getArgOperand(0))) | |||
| 683 | return processPhiNode(IC, II); | |||
| 684 | ||||
| 685 | if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) | |||
| 686 | return BinOpCombine; | |||
| 687 | ||||
| 688 | SmallVector<Instruction *, 32> CandidatesForRemoval; | |||
| 689 | Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr; | |||
| 690 | ||||
| 691 | const auto *IVTy = cast<VectorType>(II.getType()); | |||
| 692 | ||||
| 693 | // Walk the chain of conversions. | |||
| 694 | while (Cursor) { | |||
| 695 | // If the type of the cursor has fewer lanes than the final result, zeroing | |||
| 696 | // must take place, which breaks the equivalence chain. | |||
| 697 | const auto *CursorVTy = cast<VectorType>(Cursor->getType()); | |||
| 698 | if (CursorVTy->getElementCount().getKnownMinValue() < | |||
| 699 | IVTy->getElementCount().getKnownMinValue()) | |||
| 700 | break; | |||
| 701 | ||||
| 702 | // If the cursor has the same type as I, it is a viable replacement. | |||
| 703 | if (Cursor->getType() == IVTy) | |||
| 704 | EarliestReplacement = Cursor; | |||
| 705 | ||||
| 706 | auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor); | |||
| 707 | ||||
| 708 | // If this is not an SVE conversion intrinsic, this is the end of the chain. | |||
| 709 | if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == | |||
| 710 | Intrinsic::aarch64_sve_convert_to_svbool || | |||
| 711 | IntrinsicCursor->getIntrinsicID() == | |||
| 712 | Intrinsic::aarch64_sve_convert_from_svbool)) | |||
| 713 | break; | |||
| 714 | ||||
| 715 | CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor); | |||
| 716 | Cursor = IntrinsicCursor->getOperand(0); | |||
| 717 | } | |||
| 718 | ||||
| 719 | // If no viable replacement in the conversion chain was found, there is | |||
| 720 | // nothing to do. | |||
| 721 | if (!EarliestReplacement) | |||
| 722 | return std::nullopt; | |||
| 723 | ||||
| 724 | return IC.replaceInstUsesWith(II, EarliestReplacement); | |||
| 725 | } | |||
| 726 | ||||
| 727 | static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC, | |||
| 728 | IntrinsicInst &II) { | |||
| 729 | IRBuilder<> Builder(&II); | |||
| 730 | auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1), | |||
| 731 | II.getOperand(2)); | |||
| 732 | return IC.replaceInstUsesWith(II, Select); | |||
| 733 | } | |||
| 734 | ||||
| 735 | static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC, | |||
| 736 | IntrinsicInst &II) { | |||
| 737 | IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); | |||
| 738 | if (!Pg) | |||
| 739 | return std::nullopt; | |||
| 740 | ||||
| 741 | if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) | |||
| 742 | return std::nullopt; | |||
| 743 | ||||
| 744 | const auto PTruePattern = | |||
| 745 | cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); | |||
| 746 | if (PTruePattern != AArch64SVEPredPattern::vl1) | |||
| 747 | return std::nullopt; | |||
| 748 | ||||
| 749 | // The intrinsic is inserting into lane zero so use an insert instead. | |||
| 750 | auto *IdxTy = Type::getInt64Ty(II.getContext()); | |||
| 751 | auto *Insert = InsertElementInst::Create( | |||
| 752 | II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0)); | |||
| 753 | Insert->insertBefore(&II); | |||
| 754 | Insert->takeName(&II); | |||
| 755 | ||||
| 756 | return IC.replaceInstUsesWith(II, Insert); | |||
| 757 | } | |||
| 758 | ||||
| 759 | static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, | |||
| 760 | IntrinsicInst &II) { | |||
| 761 | // Replace DupX with a regular IR splat. | |||
| 762 | IRBuilder<> Builder(II.getContext()); | |||
| 763 | Builder.SetInsertPoint(&II); | |||
| 764 | auto *RetTy = cast<ScalableVectorType>(II.getType()); | |||
| 765 | Value *Splat = | |||
| 766 | Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0)); | |||
| 767 | Splat->takeName(&II); | |||
| 768 | return IC.replaceInstUsesWith(II, Splat); | |||
| 769 | } | |||
| 770 | ||||
| 771 | static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, | |||
| 772 | IntrinsicInst &II) { | |||
| 773 | LLVMContext &Ctx = II.getContext(); | |||
| 774 | IRBuilder<> Builder(Ctx); | |||
| 775 | Builder.SetInsertPoint(&II); | |||
| 776 | ||||
| 777 | // Check that the predicate is all active | |||
| 778 | auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0)); | |||
| 779 | if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) | |||
| 780 | return std::nullopt; | |||
| 781 | ||||
| 782 | const auto PTruePattern = | |||
| 783 | cast<ConstantInt>(Pg->getOperand(0))->getZExtValue(); | |||
| 784 | if (PTruePattern != AArch64SVEPredPattern::all) | |||
| 785 | return std::nullopt; | |||
| 786 | ||||
| 787 | // Check that we have a compare of zero.. | |||
| 788 | auto *SplatValue = | |||
| 789 | dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2))); | |||
| 790 | if (!SplatValue || !SplatValue->isZero()) | |||
| 791 | return std::nullopt; | |||
| 792 | ||||
| 793 | // ..against a dupq | |||
| 794 | auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1)); | |||
| 795 | if (!DupQLane || | |||
| 796 | DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) | |||
| 797 | return std::nullopt; | |||
| 798 | ||||
| 799 | // Where the dupq is a lane 0 replicate of a vector insert | |||
| 800 | if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero()) | |||
| 801 | return std::nullopt; | |||
| 802 | ||||
| 803 | auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0)); | |||
| 804 | if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert) | |||
| 805 | return std::nullopt; | |||
| 806 | ||||
| 807 | // Where the vector insert is a fixed constant vector insert into undef at | |||
| 808 | // index zero | |||
| 809 | if (!isa<UndefValue>(VecIns->getArgOperand(0))) | |||
| 810 | return std::nullopt; | |||
| 811 | ||||
| 812 | if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero()) | |||
| 813 | return std::nullopt; | |||
| 814 | ||||
| 815 | auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1)); | |||
| 816 | if (!ConstVec) | |||
| 817 | return std::nullopt; | |||
| 818 | ||||
| 819 | auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType()); | |||
| 820 | auto *OutTy = dyn_cast<ScalableVectorType>(II.getType()); | |||
| 821 | if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) | |||
| 822 | return std::nullopt; | |||
| 823 | ||||
| 824 | unsigned NumElts = VecTy->getNumElements(); | |||
| 825 | unsigned PredicateBits = 0; | |||
| 826 | ||||
| 827 | // Expand intrinsic operands to a 16-bit byte level predicate | |||
| 828 | for (unsigned I = 0; I < NumElts; ++I) { | |||
| 829 | auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I)); | |||
| 830 | if (!Arg) | |||
| 831 | return std::nullopt; | |||
| 832 | if (!Arg->isZero()) | |||
| 833 | PredicateBits |= 1 << (I * (16 / NumElts)); | |||
| 834 | } | |||
| 835 | ||||
| 836 | // If all bits are zero bail early with an empty predicate | |||
| 837 | if (PredicateBits == 0) { | |||
| 838 | auto *PFalse = Constant::getNullValue(II.getType()); | |||
| 839 | PFalse->takeName(&II); | |||
| 840 | return IC.replaceInstUsesWith(II, PFalse); | |||
| 841 | } | |||
| 842 | ||||
| 843 | // Calculate largest predicate type used (where byte predicate is largest) | |||
| 844 | unsigned Mask = 8; | |||
| 845 | for (unsigned I = 0; I < 16; ++I) | |||
| 846 | if ((PredicateBits & (1 << I)) != 0) | |||
| 847 | Mask |= (I % 8); | |||
| 848 | ||||
| 849 | unsigned PredSize = Mask & -Mask; | |||
| 850 | auto *PredType = ScalableVectorType::get( | |||
| 851 | Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8)); | |||
| 852 | ||||
| 853 | // Ensure all relevant bits are set | |||
| 854 | for (unsigned I = 0; I < 16; I += PredSize) | |||
| 855 | if ((PredicateBits & (1 << I)) == 0) | |||
| 856 | return std::nullopt; | |||
| 857 | ||||
| 858 | auto *PTruePat = | |||
| 859 | ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); | |||
| 860 | auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, | |||
| 861 | {PredType}, {PTruePat}); | |||
| 862 | auto *ConvertToSVBool = Builder.CreateIntrinsic( | |||
| 863 | Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); | |||
| 864 | auto *ConvertFromSVBool = | |||
| 865 | Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, | |||
| 866 | {II.getType()}, {ConvertToSVBool}); | |||
| 867 | ||||
| 868 | ConvertFromSVBool->takeName(&II); | |||
| 869 | return IC.replaceInstUsesWith(II, ConvertFromSVBool); | |||
| 870 | } | |||
| 871 | ||||
| 872 | static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC, | |||
| 873 | IntrinsicInst &II) { | |||
| 874 | IRBuilder<> Builder(II.getContext()); | |||
| 875 | Builder.SetInsertPoint(&II); | |||
| 876 | Value *Pg = II.getArgOperand(0); | |||
| 877 | Value *Vec = II.getArgOperand(1); | |||
| 878 | auto IntrinsicID = II.getIntrinsicID(); | |||
| 879 | bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; | |||
| 880 | ||||
| 881 | // lastX(splat(X)) --> X | |||
| 882 | if (auto *SplatVal = getSplatValue(Vec)) | |||
| 883 | return IC.replaceInstUsesWith(II, SplatVal); | |||
| 884 | ||||
| 885 | // If x and/or y is a splat value then: | |||
| 886 | // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) | |||
| 887 | Value *LHS, *RHS; | |||
| 888 | if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) { | |||
| 889 | if (isSplatValue(LHS) || isSplatValue(RHS)) { | |||
| 890 | auto *OldBinOp = cast<BinaryOperator>(Vec); | |||
| 891 | auto OpC = OldBinOp->getOpcode(); | |||
| 892 | auto *NewLHS = | |||
| 893 | Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS}); | |||
| 894 | auto *NewRHS = | |||
| 895 | Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS}); | |||
| 896 | auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( | |||
| 897 | OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II); | |||
| 898 | return IC.replaceInstUsesWith(II, NewBinOp); | |||
| 899 | } | |||
| 900 | } | |||
| 901 | ||||
| 902 | auto *C = dyn_cast<Constant>(Pg); | |||
| 903 | if (IsAfter && C && C->isNullValue()) { | |||
| 904 | // The intrinsic is extracting lane 0 so use an extract instead. | |||
| 905 | auto *IdxTy = Type::getInt64Ty(II.getContext()); | |||
| 906 | auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0)); | |||
| 907 | Extract->insertBefore(&II); | |||
| 908 | Extract->takeName(&II); | |||
| 909 | return IC.replaceInstUsesWith(II, Extract); | |||
| 910 | } | |||
| 911 | ||||
| 912 | auto *IntrPG = dyn_cast<IntrinsicInst>(Pg); | |||
| 913 | if (!IntrPG) | |||
| 914 | return std::nullopt; | |||
| 915 | ||||
| 916 | if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) | |||
| 917 | return std::nullopt; | |||
| 918 | ||||
| 919 | const auto PTruePattern = | |||
| 920 | cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue(); | |||
| 921 | ||||
| 922 | // Can the intrinsic's predicate be converted to a known constant index? | |||
| 923 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern); | |||
| 924 | if (!MinNumElts) | |||
| 925 | return std::nullopt; | |||
| 926 | ||||
| 927 | unsigned Idx = MinNumElts - 1; | |||
| 928 | // Increment the index if extracting the element after the last active | |||
| 929 | // predicate element. | |||
| 930 | if (IsAfter) | |||
| 931 | ++Idx; | |||
| 932 | ||||
| 933 | // Ignore extracts whose index is larger than the known minimum vector | |||
| 934 | // length. NOTE: This is an artificial constraint where we prefer to | |||
| 935 | // maintain what the user asked for until an alternative is proven faster. | |||
| 936 | auto *PgVTy = cast<ScalableVectorType>(Pg->getType()); | |||
| 937 | if (Idx >= PgVTy->getMinNumElements()) | |||
| 938 | return std::nullopt; | |||
| 939 | ||||
| 940 | // The intrinsic is extracting a fixed lane so use an extract instead. | |||
| 941 | auto *IdxTy = Type::getInt64Ty(II.getContext()); | |||
| 942 | auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx)); | |||
| 943 | Extract->insertBefore(&II); | |||
| 944 | Extract->takeName(&II); | |||
| 945 | return IC.replaceInstUsesWith(II, Extract); | |||
| 946 | } | |||
| 947 | ||||
| 948 | static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC, | |||
| 949 | IntrinsicInst &II) { | |||
| 950 | // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar | |||
| 951 | // integer variant across a variety of micro-architectures. Replace scalar | |||
| 952 | // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple | |||
| 953 | // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more | |||
| 954 | // depending on the micro-architecture, but has been observed as generally | |||
| 955 | // being faster, particularly when the CLAST[AB] op is a loop-carried | |||
| 956 | // dependency. | |||
| 957 | IRBuilder<> Builder(II.getContext()); | |||
| 958 | Builder.SetInsertPoint(&II); | |||
| 959 | Value *Pg = II.getArgOperand(0); | |||
| 960 | Value *Fallback = II.getArgOperand(1); | |||
| 961 | Value *Vec = II.getArgOperand(2); | |||
| 962 | Type *Ty = II.getType(); | |||
| 963 | ||||
| 964 | if (!Ty->isIntegerTy()) | |||
| 965 | return std::nullopt; | |||
| 966 | ||||
| 967 | Type *FPTy; | |||
| 968 | switch (cast<IntegerType>(Ty)->getBitWidth()) { | |||
| 969 | default: | |||
| 970 | return std::nullopt; | |||
| 971 | case 16: | |||
| 972 | FPTy = Builder.getHalfTy(); | |||
| 973 | break; | |||
| 974 | case 32: | |||
| 975 | FPTy = Builder.getFloatTy(); | |||
| 976 | break; | |||
| 977 | case 64: | |||
| 978 | FPTy = Builder.getDoubleTy(); | |||
| 979 | break; | |||
| 980 | } | |||
| 981 | ||||
| 982 | Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy); | |||
| 983 | auto *FPVTy = VectorType::get( | |||
| 984 | FPTy, cast<VectorType>(Vec->getType())->getElementCount()); | |||
| 985 | Value *FPVec = Builder.CreateBitCast(Vec, FPVTy); | |||
| 986 | auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()}, | |||
| 987 | {Pg, FPFallBack, FPVec}); | |||
| 988 | Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType()); | |||
| 989 | return IC.replaceInstUsesWith(II, FPIItoInt); | |||
| 990 | } | |||
| 991 | ||||
| 992 | static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC, | |||
| 993 | IntrinsicInst &II) { | |||
| 994 | LLVMContext &Ctx = II.getContext(); | |||
| 995 | IRBuilder<> Builder(Ctx); | |||
| 996 | Builder.SetInsertPoint(&II); | |||
| 997 | // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr | |||
| 998 | // can work with RDFFR_PP for ptest elimination. | |||
| 999 | auto *AllPat = | |||
| 1000 | ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); | |||
| 1001 | auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, | |||
| 1002 | {II.getType()}, {AllPat}); | |||
| 1003 | auto *RDFFR = | |||
| 1004 | Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); | |||
| 1005 | RDFFR->takeName(&II); | |||
| 1006 | return IC.replaceInstUsesWith(II, RDFFR); | |||
| 1007 | } | |||
| 1008 | ||||
| 1009 | static std::optional<Instruction *> | |||
| 1010 | instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { | |||
| 1011 | const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue(); | |||
| 1012 | ||||
| 1013 | if (Pattern == AArch64SVEPredPattern::all) { | |||
| 1014 | LLVMContext &Ctx = II.getContext(); | |||
| 1015 | IRBuilder<> Builder(Ctx); | |||
| 1016 | Builder.SetInsertPoint(&II); | |||
| 1017 | ||||
| 1018 | Constant *StepVal = ConstantInt::get(II.getType(), NumElts); | |||
| 1019 | auto *VScale = Builder.CreateVScale(StepVal); | |||
| 1020 | VScale->takeName(&II); | |||
| 1021 | return IC.replaceInstUsesWith(II, VScale); | |||
| 1022 | } | |||
| 1023 | ||||
| 1024 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); | |||
| 1025 | ||||
| 1026 | return MinNumElts && NumElts >= MinNumElts | |||
| 1027 | ? std::optional<Instruction *>(IC.replaceInstUsesWith( | |||
| 1028 | II, ConstantInt::get(II.getType(), MinNumElts))) | |||
| 1029 | : std::nullopt; | |||
| 1030 | } | |||
| 1031 | ||||
| 1032 | static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, | |||
| 1033 | IntrinsicInst &II) { | |||
| 1034 | Value *PgVal = II.getArgOperand(0); | |||
| 1035 | Value *OpVal = II.getArgOperand(1); | |||
| 1036 | ||||
| 1037 | IRBuilder<> Builder(II.getContext()); | |||
| 1038 | Builder.SetInsertPoint(&II); | |||
| 1039 | ||||
| 1040 | // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X). | |||
| 1041 | // Later optimizations prefer this form. | |||
| 1042 | if (PgVal == OpVal && | |||
| 1043 | (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first || | |||
| 1044 | II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) { | |||
| 1045 | Value *Ops[] = {PgVal, OpVal}; | |||
| 1046 | Type *Tys[] = {PgVal->getType()}; | |||
| 1047 | ||||
| 1048 | auto *PTest = | |||
| 1049 | Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops); | |||
| 1050 | PTest->takeName(&II); | |||
| 1051 | ||||
| 1052 | return IC.replaceInstUsesWith(II, PTest); | |||
| 1053 | } | |||
| 1054 | ||||
| 1055 | IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal); | |||
| 1056 | IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal); | |||
| 1057 | ||||
| 1058 | if (!Pg || !Op) | |||
| 1059 | return std::nullopt; | |||
| 1060 | ||||
| 1061 | Intrinsic::ID OpIID = Op->getIntrinsicID(); | |||
| 1062 | ||||
| 1063 | if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && | |||
| 1064 | OpIID == Intrinsic::aarch64_sve_convert_to_svbool && | |||
| 1065 | Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) { | |||
| 1066 | Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)}; | |||
| 1067 | Type *Tys[] = {Pg->getArgOperand(0)->getType()}; | |||
| 1068 | ||||
| 1069 | auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); | |||
| 1070 | ||||
| 1071 | PTest->takeName(&II); | |||
| 1072 | return IC.replaceInstUsesWith(II, PTest); | |||
| 1073 | } | |||
| 1074 | ||||
| 1075 | // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)). | |||
| 1076 | // Later optimizations may rewrite sequence to use the flag-setting variant | |||
| 1077 | // of instruction X to remove PTEST. | |||
| 1078 | if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) && | |||
| 1079 | ((OpIID == Intrinsic::aarch64_sve_brka_z) || | |||
| 1080 | (OpIID == Intrinsic::aarch64_sve_brkb_z) || | |||
| 1081 | (OpIID == Intrinsic::aarch64_sve_brkpa_z) || | |||
| 1082 | (OpIID == Intrinsic::aarch64_sve_brkpb_z) || | |||
| 1083 | (OpIID == Intrinsic::aarch64_sve_rdffr_z) || | |||
| 1084 | (OpIID == Intrinsic::aarch64_sve_and_z) || | |||
| 1085 | (OpIID == Intrinsic::aarch64_sve_bic_z) || | |||
| 1086 | (OpIID == Intrinsic::aarch64_sve_eor_z) || | |||
| 1087 | (OpIID == Intrinsic::aarch64_sve_nand_z) || | |||
| 1088 | (OpIID == Intrinsic::aarch64_sve_nor_z) || | |||
| 1089 | (OpIID == Intrinsic::aarch64_sve_orn_z) || | |||
| 1090 | (OpIID == Intrinsic::aarch64_sve_orr_z))) { | |||
| 1091 | Value *Ops[] = {Pg->getArgOperand(0), Pg}; | |||
| 1092 | Type *Tys[] = {Pg->getType()}; | |||
| 1093 | ||||
| 1094 | auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops); | |||
| 1095 | PTest->takeName(&II); | |||
| 1096 | ||||
| 1097 | return IC.replaceInstUsesWith(II, PTest); | |||
| 1098 | } | |||
| 1099 | ||||
| 1100 | return std::nullopt; | |||
| 1101 | } | |||
| 1102 | ||||
| 1103 | template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc> | |||
| 1104 | static std::optional<Instruction *> | |||
| 1105 | instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, | |||
| 1106 | bool MergeIntoAddendOp) { | |||
| 1107 | Value *P = II.getOperand(0); | |||
| 1108 | Value *MulOp0, *MulOp1, *AddendOp, *Mul; | |||
| 1109 | if (MergeIntoAddendOp) { | |||
| 1110 | AddendOp = II.getOperand(1); | |||
| 1111 | Mul = II.getOperand(2); | |||
| 1112 | } else { | |||
| 1113 | AddendOp = II.getOperand(2); | |||
| 1114 | Mul = II.getOperand(1); | |||
| 1115 | } | |||
| 1116 | ||||
| 1117 | if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0), | |||
| 1118 | m_Value(MulOp1)))) | |||
| 1119 | return std::nullopt; | |||
| 1120 | ||||
| 1121 | if (!Mul->hasOneUse()) | |||
| 1122 | return std::nullopt; | |||
| 1123 | ||||
| 1124 | Instruction *FMFSource = nullptr; | |||
| 1125 | if (II.getType()->isFPOrFPVectorTy()) { | |||
| 1126 | llvm::FastMathFlags FAddFlags = II.getFastMathFlags(); | |||
| 1127 | // Stop the combine when the flags on the inputs differ in case dropping | |||
| 1128 | // flags would lead to us missing out on more beneficial optimizations. | |||
| 1129 | if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags()) | |||
| 1130 | return std::nullopt; | |||
| 1131 | if (!FAddFlags.allowContract()) | |||
| 1132 | return std::nullopt; | |||
| 1133 | FMFSource = &II; | |||
| 1134 | } | |||
| 1135 | ||||
| 1136 | IRBuilder<> Builder(II.getContext()); | |||
| 1137 | Builder.SetInsertPoint(&II); | |||
| 1138 | ||||
| 1139 | CallInst *Res; | |||
| 1140 | if (MergeIntoAddendOp) | |||
| 1141 | Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()}, | |||
| 1142 | {P, AddendOp, MulOp0, MulOp1}, FMFSource); | |||
| 1143 | else | |||
| 1144 | Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()}, | |||
| 1145 | {P, MulOp0, MulOp1, AddendOp}, FMFSource); | |||
| 1146 | ||||
| 1147 | return IC.replaceInstUsesWith(II, Res); | |||
| 1148 | } | |||
| 1149 | ||||
| 1150 | static bool isAllActivePredicate(Value *Pred) { | |||
| 1151 | // Look through convert.from.svbool(convert.to.svbool(...) chain. | |||
| 1152 | Value *UncastedPred; | |||
| 1153 | if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( | |||
| 1154 | m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( | |||
| 1155 | m_Value(UncastedPred))))) | |||
| 1156 | // If the predicate has the same or less lanes than the uncasted | |||
| 1157 | // predicate then we know the casting has no effect. | |||
| 1158 | if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <= | |||
| 1159 | cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements()) | |||
| 1160 | Pred = UncastedPred; | |||
| 1161 | ||||
| 1162 | return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( | |||
| 1163 | m_ConstantInt<AArch64SVEPredPattern::all>())); | |||
| 1164 | } | |||
| 1165 | ||||
| 1166 | static std::optional<Instruction *> | |||
| 1167 | instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { | |||
| 1168 | IRBuilder<> Builder(II.getContext()); | |||
| 1169 | Builder.SetInsertPoint(&II); | |||
| 1170 | ||||
| 1171 | Value *Pred = II.getOperand(0); | |||
| 1172 | Value *PtrOp = II.getOperand(1); | |||
| 1173 | Type *VecTy = II.getType(); | |||
| 1174 | Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo()); | |||
| 1175 | ||||
| 1176 | if (isAllActivePredicate(Pred)) { | |||
| 1177 | LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr); | |||
| 1178 | Load->copyMetadata(II); | |||
| 1179 | return IC.replaceInstUsesWith(II, Load); | |||
| 1180 | } | |||
| 1181 | ||||
| 1182 | CallInst *MaskedLoad = | |||
| 1183 | Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL), | |||
| 1184 | Pred, ConstantAggregateZero::get(VecTy)); | |||
| 1185 | MaskedLoad->copyMetadata(II); | |||
| 1186 | return IC.replaceInstUsesWith(II, MaskedLoad); | |||
| 1187 | } | |||
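| | // Illustrative sketch (assumed IR shapes, not from the source), e.g. | |||
| | // sve.ld1(ptrue(all), %p) --> load <vscale x 4 x i32>, ptr %p | |||
| | // sve.ld1(%pg, %p) --> masked.load(%p, align, %pg, zeroinitializer) | |||
| | // with the original load metadata copied onto the replacement. | |||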
| 1188 | ||||
| 1189 | static std::optional<Instruction *> | |||
| 1190 | instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { | |||
| 1191 | IRBuilder<> Builder(II.getContext()); | |||
| 1192 | Builder.SetInsertPoint(&II); | |||
| 1193 | ||||
| 1194 | Value *VecOp = II.getOperand(0); | |||
| 1195 | Value *Pred = II.getOperand(1); | |||
| 1196 | Value *PtrOp = II.getOperand(2); | |||
| 1197 | Value *VecPtr = | |||
| 1198 | Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo()); | |||
| 1199 | ||||
| 1200 | if (isAllActivePredicate(Pred)) { | |||
| 1201 | StoreInst *Store = Builder.CreateStore(VecOp, VecPtr); | |||
| 1202 | Store->copyMetadata(II); | |||
| 1203 | return IC.eraseInstFromFunction(II); | |||
| 1204 | } | |||
| 1205 | ||||
| 1206 | CallInst *MaskedStore = Builder.CreateMaskedStore( | |||
| 1207 | VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred); | |||
| 1208 | MaskedStore->copyMetadata(II); | |||
| 1209 | return IC.eraseInstFromFunction(II); | |||
| 1210 | } | |||
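| | // The store case mirrors the load above (illustrative), e.g. | |||
| | // sve.st1(%v, ptrue(all), %p) --> store %v, ptr %p | |||
| | // sve.st1(%v, %pg, %p) --> masked.store(%v, %p, align, %pg) | |||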
| 1211 | ||||
| 1212 | static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { | |||
| 1213 | switch (Intrinsic) { | |||
| 1214 | case Intrinsic::aarch64_sve_fmul: | |||
| 1215 | return Instruction::BinaryOps::FMul; | |||
| 1216 | case Intrinsic::aarch64_sve_fadd: | |||
| 1217 | return Instruction::BinaryOps::FAdd; | |||
| 1218 | case Intrinsic::aarch64_sve_fsub: | |||
| 1219 | return Instruction::BinaryOps::FSub; | |||
| 1220 | default: | |||
| 1221 | return Instruction::BinaryOpsEnd; | |||
| 1222 | } | |||
| 1223 | } | |||
| 1224 | ||||
| 1225 | static std::optional<Instruction *> | |||
| 1226 | instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) { | |||
| 1227 | // Bail due to missing support for ISD::STRICT_ scalable vector operations. | |||
| 1228 | if (II.isStrictFP()) | |||
| 1229 | return std::nullopt; | |||
| 1230 | ||||
| 1231 | auto *OpPredicate = II.getOperand(0); | |||
| 1232 | auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID()); | |||
| 1233 | if (BinOpCode == Instruction::BinaryOpsEnd || | |||
| 1234 | !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( | |||
| 1235 | m_ConstantInt<AArch64SVEPredPattern::all>()))) | |||
| 1236 | return std::nullopt; | |||
| 1237 | IRBuilder<> Builder(II.getContext()); | |||
| 1238 | Builder.SetInsertPoint(&II); | |||
| 1239 | Builder.setFastMathFlags(II.getFastMathFlags()); | |||
| 1240 | auto BinOp = | |||
| 1241 | Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2)); | |||
| 1242 | return IC.replaceInstUsesWith(II, BinOp); | |||
| 1243 | } | |||
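| | // Illustrative example (annotation): with an all-active ptrue predicate, | |||
| | // aarch64.sve.fmul(ptrue(all), %a, %b) --> fmul %a, %b | |||
| | // i.e. the predicated intrinsic becomes the plain IR binary operator, | |||
| | // carrying over the intrinsic's fast-math flags. | |||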
| 1244 | ||||
| 1245 | static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC, | |||
| 1246 | IntrinsicInst &II) { | |||
| 1247 | if (auto FMLA = | |||
| 1248 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, | |||
| 1249 | Intrinsic::aarch64_sve_fmla>(IC, II, | |||
| 1250 | true)) | |||
| 1251 | return FMLA; | |||
| 1252 | if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, | |||
| 1253 | Intrinsic::aarch64_sve_mla>( | |||
| 1254 | IC, II, true)) | |||
| 1255 | return MLA; | |||
| 1256 | if (auto FMAD = | |||
| 1257 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, | |||
| 1258 | Intrinsic::aarch64_sve_fmad>(IC, II, | |||
| 1259 | false)) | |||
| 1260 | return FMAD; | |||
| 1261 | if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, | |||
| 1262 | Intrinsic::aarch64_sve_mad>( | |||
| 1263 | IC, II, false)) | |||
| 1264 | return MAD; | |||
| 1265 | return instCombineSVEVectorBinOp(IC, II); | |||
| 1266 | } | |||
| 1267 | ||||
| 1268 | static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC, | |||
| 1269 | IntrinsicInst &II) { | |||
| 1270 | if (auto FMLS = | |||
| 1271 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, | |||
| 1272 | Intrinsic::aarch64_sve_fmls>(IC, II, | |||
| 1273 | true)) | |||
| 1274 | return FMLS; | |||
| 1275 | if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, | |||
| 1276 | Intrinsic::aarch64_sve_mls>( | |||
| 1277 | IC, II, true)) | |||
| 1278 | return MLS; | |||
| 1279 | if (auto FMSB = | |||
| 1280 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, | |||
| 1281 | Intrinsic::aarch64_sve_fnmsb>( | |||
| 1282 | IC, II, false)) | |||
| 1283 | return FMSB; | |||
| 1284 | return instCombineSVEVectorBinOp(IC, II); | |||
| 1285 | } | |||
| 1286 | ||||
| 1287 | static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC, | |||
| 1288 | IntrinsicInst &II) { | |||
| 1289 | auto *OpPredicate = II.getOperand(0); | |||
| 1290 | auto *OpMultiplicand = II.getOperand(1); | |||
| 1291 | auto *OpMultiplier = II.getOperand(2); | |||
| 1292 | ||||
| 1293 | IRBuilder<> Builder(II.getContext()); | |||
| 1294 | Builder.SetInsertPoint(&II); | |||
| 1295 | ||||
| 1296 | // Return true if a given instruction is a unit splat value, false otherwise. | |||
| 1297 | auto IsUnitSplat = [](auto *I) { | |||
| 1298 | auto *SplatValue = getSplatValue(I); | |||
| 1299 | if (!SplatValue) | |||
| 1300 | return false; | |||
| 1301 | return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); | |||
| 1302 | }; | |||
| 1303 | ||||
| 1304 | // Return true if a given instruction is an aarch64_sve_dup intrinsic call | |||
| 1305 | // with a unit splat value, false otherwise. | |||
| 1306 | auto IsUnitDup = [](auto *I) { | |||
| 1307 | auto *IntrI = dyn_cast<IntrinsicInst>(I); | |||
| 1308 | if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) | |||
| 1309 | return false; | |||
| 1310 | ||||
| 1311 | auto *SplatValue = IntrI->getOperand(2); | |||
| 1312 | return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); | |||
| 1313 | }; | |||
| 1314 | ||||
| 1315 | if (IsUnitSplat(OpMultiplier)) { | |||
| 1316 | // [f]mul pg %n, (dupx 1) => %n | |||
| 1317 | OpMultiplicand->takeName(&II); | |||
| 1318 | return IC.replaceInstUsesWith(II, OpMultiplicand); | |||
| 1319 | } else if (IsUnitDup(OpMultiplier)) { | |||
| 1320 | // [f]mul pg %n, (dup pg 1) => %n | |||
| 1321 | auto *DupInst = cast<IntrinsicInst>(OpMultiplier); | |||
| 1322 | auto *DupPg = DupInst->getOperand(1); | |||
| 1323 | // TODO: this is naive. The optimization is still valid if DupPg | |||
| 1324 | // 'encompasses' OpPredicate, not only if they're the same predicate. | |||
| 1325 | if (OpPredicate == DupPg) { | |||
| 1326 | OpMultiplicand->takeName(&II); | |||
| 1327 | return IC.replaceInstUsesWith(II, OpMultiplicand); | |||
| 1328 | } | |||
| 1329 | } | |||
| 1330 | ||||
| 1331 | return instCombineSVEVectorBinOp(IC, II); | |||
| 1332 | } | |||
| 1333 | ||||
| 1334 | static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, | |||
| 1335 | IntrinsicInst &II) { | |||
| 1336 | IRBuilder<> Builder(II.getContext()); | |||
| 1337 | Builder.SetInsertPoint(&II); | |||
| 1338 | Value *UnpackArg = II.getArgOperand(0); | |||
| 1339 | auto *RetTy = cast<ScalableVectorType>(II.getType()); | |||
| 1340 | bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || | |||
| 1341 | II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; | |||
| 1342 | ||||
| 1343 | // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) | |||
| 1344 | // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) | |||
| 1345 | if (auto *ScalarArg = getSplatValue(UnpackArg)) { | |||
| 1346 | ScalarArg = | |||
| 1347 | Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned); | |||
| 1348 | Value *NewVal = | |||
| 1349 | Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg); | |||
| 1350 | NewVal->takeName(&II); | |||
| 1351 | return IC.replaceInstUsesWith(II, NewVal); | |||
| 1352 | } | |||
| 1353 | ||||
| 1354 | return std::nullopt; | |||
| 1355 | } | |||
| 1356 | static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC, | |||
| 1357 | IntrinsicInst &II) { | |||
| 1358 | auto *OpVal = II.getOperand(0); | |||
| 1359 | auto *OpIndices = II.getOperand(1); | |||
| 1360 | VectorType *VTy = cast<VectorType>(II.getType()); | |||
| 1361 | ||||
| 1362 | // Check whether OpIndices is a constant splat value smaller than the | |||
| 1363 | // minimum element count of the result. | |||
| 1364 | auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices)); | |||
| 1365 | if (!SplatValue || | |||
| 1366 | SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue())) | |||
| 1367 | return std::nullopt; | |||
| 1368 | ||||
| 1369 | // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to | |||
| 1370 | // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. | |||
| 1371 | IRBuilder<> Builder(II.getContext()); | |||
| 1372 | Builder.SetInsertPoint(&II); | |||
| 1373 | auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue); | |||
| 1374 | auto *VectorSplat = | |||
| 1375 | Builder.CreateVectorSplat(VTy->getElementCount(), Extract); | |||
| 1376 | ||||
| 1377 | VectorSplat->takeName(&II); | |||
| 1378 | return IC.replaceInstUsesWith(II, VectorSplat); | |||
| 1379 | } | |||
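| | // Illustrative sketch (annotation): e.g. sve.tbl(%v, sve.dup.x(i32 2)), | |||
| | // with 2 known to be below the minimum element count, becomes | |||
| | // splat(extractelement(%v, 2)) | |||
| | // which generic folds can often simplify further. | |||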
| 1380 | ||||
| 1381 | static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC, | |||
| 1382 | IntrinsicInst &II) { | |||
| 1383 | // zip1(uzp1(A, B), uzp2(A, B)) --> A | |||
| 1384 | // zip2(uzp1(A, B), uzp2(A, B)) --> B | |||
| 1385 | Value *A, *B; | |||
| 1386 | if (match(II.getArgOperand(0), | |||
| 1387 | m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) && | |||
| 1388 | match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( | |||
| 1389 | m_Specific(A), m_Specific(B)))) | |||
| 1390 | return IC.replaceInstUsesWith( | |||
| 1391 | II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); | |||
| 1392 | ||||
| 1393 | return std::nullopt; | |||
| 1394 | } | |||
| 1395 | ||||
| 1396 | static std::optional<Instruction *> | |||
| 1397 | instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) { | |||
| 1398 | Value *Mask = II.getOperand(0); | |||
| 1399 | Value *BasePtr = II.getOperand(1); | |||
| 1400 | Value *Index = II.getOperand(2); | |||
| 1401 | Type *Ty = II.getType(); | |||
| 1402 | Value *PassThru = ConstantAggregateZero::get(Ty); | |||
| 1403 | ||||
| 1404 | // Contiguous gather => masked load. | |||
| 1405 | // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) | |||
| 1406 | // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) | |||
| 1407 | Value *IndexBase; | |||
| 1408 | if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( | |||
| 1409 | m_Value(IndexBase), m_SpecificInt(1)))) { | |||
| 1410 | IRBuilder<> Builder(II.getContext()); | |||
| 1411 | Builder.SetInsertPoint(&II); | |||
| 1412 | ||||
| 1413 | Align Alignment = | |||
| 1414 | BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); | |||
| 1415 | ||||
| 1416 | Type *VecPtrTy = PointerType::getUnqual(Ty); | |||
| 1417 | Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(), | |||
| 1418 | BasePtr, IndexBase); | |||
| 1419 | Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); | |||
| 1420 | CallInst *MaskedLoad = | |||
| 1421 | Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); | |||
| 1422 | MaskedLoad->takeName(&II); | |||
| 1423 | return IC.replaceInstUsesWith(II, MaskedLoad); | |||
| 1424 | } | |||
| 1425 | ||||
| 1426 | return std::nullopt; | |||
| 1427 | } | |||
| 1428 | ||||
| 1429 | static std::optional<Instruction *> | |||
| 1430 | instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) { | |||
| 1431 | Value *Val = II.getOperand(0); | |||
| 1432 | Value *Mask = II.getOperand(1); | |||
| 1433 | Value *BasePtr = II.getOperand(2); | |||
| 1434 | Value *Index = II.getOperand(3); | |||
| 1435 | Type *Ty = Val->getType(); | |||
| 1436 | ||||
| 1437 | // Contiguous scatter => masked store. | |||
| 1438 | // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) | |||
| 1439 | // => (masked.store Value (gep BasePtr IndexBase) Align Mask) | |||
| 1440 | Value *IndexBase; | |||
| 1441 | if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( | |||
| 1442 | m_Value(IndexBase), m_SpecificInt(1)))) { | |||
| 1443 | IRBuilder<> Builder(II.getContext()); | |||
| 1444 | Builder.SetInsertPoint(&II); | |||
| 1445 | ||||
| 1446 | Align Alignment = | |||
| 1447 | BasePtr->getPointerAlignment(II.getModule()->getDataLayout()); | |||
| 1448 | ||||
| 1449 | Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(), | |||
| 1450 | BasePtr, IndexBase); | |||
| 1451 | Type *VecPtrTy = PointerType::getUnqual(Ty); | |||
| 1452 | Ptr = Builder.CreateBitCast(Ptr, VecPtrTy); | |||
| 1453 | ||||
| 1454 | (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); | |||
| 1455 | ||||
| 1456 | return IC.eraseInstFromFunction(II); | |||
| 1457 | } | |||
| 1458 | ||||
| 1459 | return std::nullopt; | |||
| 1460 | } | |||
| 1461 | ||||
| 1462 | static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, | |||
| 1463 | IntrinsicInst &II) { | |||
| 1464 | IRBuilder<> Builder(II.getContext()); | |||
| 1465 | Builder.SetInsertPoint(&II); | |||
| 1466 | Type *Int32Ty = Builder.getInt32Ty(); | |||
| 1467 | Value *Pred = II.getOperand(0); | |||
| 1468 | Value *Vec = II.getOperand(1); | |||
| 1469 | Value *DivVec = II.getOperand(2); | |||
| 1470 | ||||
| 1471 | Value *SplatValue = getSplatValue(DivVec); | |||
| 1472 | ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue); | |||
| 1473 | if (!SplatConstantInt) | |||
| 1474 | return std::nullopt; | |||
| 1475 | APInt Divisor = SplatConstantInt->getValue(); | |||
| 1476 | ||||
| 1477 | if (Divisor.isPowerOf2()) { | |||
| 1478 | Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); | |||
| 1479 | auto ASRD = Builder.CreateIntrinsic( | |||
| 1480 | Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); | |||
| 1481 | return IC.replaceInstUsesWith(II, ASRD); | |||
| 1482 | } | |||
| 1483 | if (Divisor.isNegatedPowerOf2()) { | |||
| 1484 | Divisor.negate(); | |||
| 1485 | Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); | |||
| 1486 | auto ASRD = Builder.CreateIntrinsic( | |||
| 1487 | Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); | |||
| 1488 | auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg, | |||
| 1489 | {ASRD->getType()}, {ASRD, Pred, ASRD}); | |||
| 1490 | return IC.replaceInstUsesWith(II, NEG); | |||
| 1491 | } | |||
| 1492 | ||||
| 1493 | return std::nullopt; | |||
| 1494 | } | |||
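| | // Illustrative examples (annotation, operand order simplified): | |||
| | // sve.sdiv(pg, %x, splat(8)) --> sve.asrd(pg, %x, 3) | |||
| | // sve.sdiv(pg, %x, splat(-8)) --> sve.neg(pg, sve.asrd(pg, %x, 3)) | |||
| | // ASRD is a rounding arithmetic shift right, so it matches signed division | |||
| | // semantics for power-of-two divisors. | |||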
| 1495 | ||||
| 1496 | bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) { | |||
| 1497 | size_t VecSize = Vec.size(); | |||
| 1498 | if (VecSize == 1) | |||
| 1499 | return true; | |||
| 1500 | if (!isPowerOf2_64(VecSize)) | |||
| 1501 | return false; | |||
| 1502 | size_t HalfVecSize = VecSize / 2; | |||
| 1503 | ||||
| 1504 | for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize; | |||
| 1505 | RHS != Vec.end(); LHS++, RHS++) { | |||
| 1506 | if (*LHS != nullptr && *RHS != nullptr) { | |||
| 1507 | if (*LHS == *RHS) | |||
| 1508 | continue; | |||
| 1509 | else | |||
| 1510 | return false; | |||
| 1511 | } | |||
| 1512 | if (!AllowPoison) | |||
| 1513 | return false; | |||
| 1514 | if (*LHS == nullptr && *RHS != nullptr) | |||
| 1515 | *LHS = *RHS; | |||
| 1516 | } | |||
| 1517 | ||||
| 1518 | Vec.resize(HalfVecSize); | |||
| 1519 | SimplifyValuePattern(Vec, AllowPoison); | |||
| 1520 | return true; | |||
| 1521 | } | |||
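| | // Illustrative note (annotation): each call folds the second half of the | |||
| | // collected scalars into the first when the halves match, with nullptr | |||
| | // entries acting as poison wildcards, e.g. (a, b, a, b) -> (a, b). | |||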
| 1522 | ||||
| 1523 | // Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B) | |||
| 1524 | // to dupqlane(f64(C)), where C is A concatenated with B. | |||
| 1525 | static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC, | |||
| 1526 | IntrinsicInst &II) { | |||
| 1527 | Value *CurrentInsertElt = nullptr, *Default = nullptr; | |||
| 1528 | if (!match(II.getOperand(0), | |||
| 1529 | m_Intrinsic<Intrinsic::vector_insert>( | |||
| 1530 | m_Value(Default), m_Value(CurrentInsertElt), m_Value())) || | |||
| 1531 | !isa<FixedVectorType>(CurrentInsertElt->getType())) | |||
| 1532 | return std::nullopt; | |||
| 1533 | auto IIScalableTy = cast<ScalableVectorType>(II.getType()); | |||
| 1534 | ||||
| 1535 | // Insert the scalars into a container ordered by InsertElement index | |||
| 1536 | SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr); | |||
| 1537 | while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) { | |||
| 1538 | auto Idx = cast<ConstantInt>(InsertElt->getOperand(2)); | |||
| 1539 | Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1); | |||
| 1540 | CurrentInsertElt = InsertElt->getOperand(0); | |||
| 1541 | } | |||
| 1542 | ||||
| 1543 | bool AllowPoison = | |||
| 1544 | isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default); | |||
| 1545 | if (!SimplifyValuePattern(Elts, AllowPoison)) | |||
| 1546 | return std::nullopt; | |||
| 1547 | ||||
| 1548 | // Rebuild the simplified chain of InsertElements, e.g. (a, b, a, b) as (a, b). | |||
| 1549 | IRBuilder<> Builder(II.getContext()); | |||
| 1550 | Builder.SetInsertPoint(&II); | |||
| 1551 | Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType()); | |||
| 1552 | for (size_t I = 0; I < Elts.size(); I++) { | |||
| 1553 | if (Elts[I] == nullptr) | |||
| 1554 | continue; | |||
| 1555 | InsertEltChain = Builder.CreateInsertElement(InsertEltChain, Elts[I], | |||
| 1556 | Builder.getInt64(I)); | |||
| 1557 | } | |||
| 1558 | if (InsertEltChain == nullptr) | |||
| 1559 | return std::nullopt; | |||
| 1560 | ||||
| 1561 | // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64 | |||
| 1562 | // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector | |||
| 1563 | // be bitcast to a type wide enough to fit the sequence, be splatted, and then | |||
| 1564 | // be narrowed back to the original type. | |||
| 1565 | unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size(); | |||
| 1566 | unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() * | |||
| 1567 | IIScalableTy->getMinNumElements() / | |||
| 1568 | PatternWidth; | |||
| 1569 | ||||
| 1570 | IntegerType *WideTy = Builder.getIntNTy(PatternWidth); | |||
| 1571 | auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount); | |||
| 1572 | auto *WideShuffleMaskTy = | |||
| 1573 | ScalableVectorType::get(Builder.getInt32Ty(), PatternElementCount); | |||
| 1574 | ||||
| 1575 | auto ZeroIdx = ConstantInt::get(Builder.getInt64Ty(), APInt(64, 0)); | |||
| 1576 | auto InsertSubvector = Builder.CreateInsertVector( | |||
| 1577 | II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx); | |||
| 1578 | auto WideBitcast = | |||
| 1579 | Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy); | |||
| 1580 | auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy); | |||
| 1581 | auto WideShuffle = Builder.CreateShuffleVector( | |||
| 1582 | WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask); | |||
| 1583 | auto NarrowBitcast = | |||
| 1584 | Builder.CreateBitOrPointerCast(WideShuffle, II.getType()); | |||
| 1585 | ||||
| 1586 | return IC.replaceInstUsesWith(II, NarrowBitcast); | |||
| 1587 | } | |||
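| | // Sketch of the steps above (illustrative): a chain building (f32 A, B, A, B) | |||
| | // is reduced to (A, B), inserted into a poison vector, bitcast to an i64 | |||
| | // element type wide enough to hold the pattern, splatted with an all-zero | |||
| | // shufflevector mask, and bitcast back to the original element type. | |||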
| 1588 | ||||
| 1589 | static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, | |||
| 1590 | IntrinsicInst &II) { | |||
| 1591 | Value *A = II.getArgOperand(0); | |||
| 1592 | Value *B = II.getArgOperand(1); | |||
| 1593 | if (A == B) | |||
| 1594 | return IC.replaceInstUsesWith(II, A); | |||
| 1595 | ||||
| 1596 | return std::nullopt; | |||
| 1597 | } | |||
| 1598 | ||||
| 1599 | static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC, | |||
| 1600 | IntrinsicInst &II) { | |||
| 1601 | IRBuilder<> Builder(&II); | |||
| 1602 | Value *Pred = II.getOperand(0); | |||
| 1603 | Value *Vec = II.getOperand(1); | |||
| 1604 | Value *Shift = II.getOperand(2); | |||
| 1605 | ||||
| 1606 | // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. | |||
| 1607 | Value *AbsPred, *MergedValue; | |||
| 1608 | if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>( | |||
| 1609 | m_Value(MergedValue), m_Value(AbsPred), m_Value())) && | |||
| 1610 | !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>( | |||
| 1611 | m_Value(MergedValue), m_Value(AbsPred), m_Value()))) | |||
| 1612 | ||||
| 1613 | return std::nullopt; | |||
| 1614 | ||||
| 1615 | // Transform is valid if any of the following are true: | |||
| 1616 | // * The ABS merge value is undef or non-negative | |||
| 1617 | // * The ABS predicate is all active | |||
| 1618 | // * The ABS predicate and the SRSHL predicates are the same | |||
| 1619 | if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) && | |||
| 1620 | AbsPred != Pred && !isAllActivePredicate(AbsPred)) | |||
| 1621 | return std::nullopt; | |||
| 1622 | ||||
| 1623 | // Only valid when the shift amount is non-negative, otherwise the rounding | |||
| 1624 | // behaviour of SRSHL cannot be ignored. | |||
| 1625 | if (!match(Shift, m_NonNegative())) | |||
| 1626 | return std::nullopt; | |||
| 1627 | ||||
| 1628 | auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()}, | |||
| 1629 | {Pred, Vec, Shift}); | |||
| 1630 | ||||
| 1631 | return IC.replaceInstUsesWith(II, LSL); | |||
| 1632 | } | |||
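| | // Illustrative example (annotation, operands abbreviated): when the shifted | |||
| | // value comes from an ABS whose predicate and merge value make it provably | |||
| | // non-negative, and the shift amount is non-negative, e.g. | |||
| | // sve.srshl(pg, sve.abs(undef, pg, %x), splat(2)) | |||
| | // --> sve.lsl(pg, sve.abs(undef, pg, %x), splat(2)) | |||
| | // the rounding behaviour of SRSHL is irrelevant and LSL suffices. | |||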
| 1633 | ||||
| 1634 | std::optional<Instruction *> | |||
| 1635 | AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, | |||
| 1636 | IntrinsicInst &II) const { | |||
| 1637 | Intrinsic::ID IID = II.getIntrinsicID(); | |||
| 1638 | switch (IID) { | |||
| 1639 | default: | |||
| 1640 | break; | |||
| 1641 | case Intrinsic::aarch64_neon_fmaxnm: | |||
| 1642 | case Intrinsic::aarch64_neon_fminnm: | |||
| 1643 | return instCombineMaxMinNM(IC, II); | |||
| 1644 | case Intrinsic::aarch64_sve_convert_from_svbool: | |||
| 1645 | return instCombineConvertFromSVBool(IC, II); | |||
| 1646 | case Intrinsic::aarch64_sve_dup: | |||
| 1647 | return instCombineSVEDup(IC, II); | |||
| 1648 | case Intrinsic::aarch64_sve_dup_x: | |||
| 1649 | return instCombineSVEDupX(IC, II); | |||
| 1650 | case Intrinsic::aarch64_sve_cmpne: | |||
| 1651 | case Intrinsic::aarch64_sve_cmpne_wide: | |||
| 1652 | return instCombineSVECmpNE(IC, II); | |||
| 1653 | case Intrinsic::aarch64_sve_rdffr: | |||
| 1654 | return instCombineRDFFR(IC, II); | |||
| 1655 | case Intrinsic::aarch64_sve_lasta: | |||
| 1656 | case Intrinsic::aarch64_sve_lastb: | |||
| 1657 | return instCombineSVELast(IC, II); | |||
| 1658 | case Intrinsic::aarch64_sve_clasta_n: | |||
| 1659 | case Intrinsic::aarch64_sve_clastb_n: | |||
| 1660 | return instCombineSVECondLast(IC, II); | |||
| 1661 | case Intrinsic::aarch64_sve_cntd: | |||
| 1662 | return instCombineSVECntElts(IC, II, 2); | |||
| 1663 | case Intrinsic::aarch64_sve_cntw: | |||
| 1664 | return instCombineSVECntElts(IC, II, 4); | |||
| 1665 | case Intrinsic::aarch64_sve_cnth: | |||
| 1666 | return instCombineSVECntElts(IC, II, 8); | |||
| 1667 | case Intrinsic::aarch64_sve_cntb: | |||
| 1668 | return instCombineSVECntElts(IC, II, 16); | |||
| 1669 | case Intrinsic::aarch64_sve_ptest_any: | |||
| 1670 | case Intrinsic::aarch64_sve_ptest_first: | |||
| 1671 | case Intrinsic::aarch64_sve_ptest_last: | |||
| 1672 | return instCombineSVEPTest(IC, II); | |||
| 1673 | case Intrinsic::aarch64_sve_mul: | |||
| 1674 | case Intrinsic::aarch64_sve_fmul: | |||
| 1675 | return instCombineSVEVectorMul(IC, II); | |||
| 1676 | case Intrinsic::aarch64_sve_fadd: | |||
| 1677 | case Intrinsic::aarch64_sve_add: | |||
| 1678 | return instCombineSVEVectorAdd(IC, II); | |||
| 1679 | case Intrinsic::aarch64_sve_fadd_u: | |||
| 1680 | return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, | |||
| 1681 | Intrinsic::aarch64_sve_fmla_u>( | |||
| 1682 | IC, II, true); | |||
| 1683 | case Intrinsic::aarch64_sve_fsub: | |||
| 1684 | case Intrinsic::aarch64_sve_sub: | |||
| 1685 | return instCombineSVEVectorSub(IC, II); | |||
| 1686 | case Intrinsic::aarch64_sve_fsub_u: | |||
| 1687 | return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, | |||
| 1688 | Intrinsic::aarch64_sve_fmls_u>( | |||
| 1689 | IC, II, true); | |||
| 1690 | case Intrinsic::aarch64_sve_tbl: | |||
| 1691 | return instCombineSVETBL(IC, II); | |||
| 1692 | case Intrinsic::aarch64_sve_uunpkhi: | |||
| 1693 | case Intrinsic::aarch64_sve_uunpklo: | |||
| 1694 | case Intrinsic::aarch64_sve_sunpkhi: | |||
| 1695 | case Intrinsic::aarch64_sve_sunpklo: | |||
| 1696 | return instCombineSVEUnpack(IC, II); | |||
| 1697 | case Intrinsic::aarch64_sve_zip1: | |||
| 1698 | case Intrinsic::aarch64_sve_zip2: | |||
| 1699 | return instCombineSVEZip(IC, II); | |||
| 1700 | case Intrinsic::aarch64_sve_ld1_gather_index: | |||
| 1701 | return instCombineLD1GatherIndex(IC, II); | |||
| 1702 | case Intrinsic::aarch64_sve_st1_scatter_index: | |||
| 1703 | return instCombineST1ScatterIndex(IC, II); | |||
| 1704 | case Intrinsic::aarch64_sve_ld1: | |||
| 1705 | return instCombineSVELD1(IC, II, DL); | |||
| 1706 | case Intrinsic::aarch64_sve_st1: | |||
| 1707 | return instCombineSVEST1(IC, II, DL); | |||
| 1708 | case Intrinsic::aarch64_sve_sdiv: | |||
| 1709 | return instCombineSVESDIV(IC, II); | |||
| 1710 | case Intrinsic::aarch64_sve_sel: | |||
| 1711 | return instCombineSVESel(IC, II); | |||
| 1712 | case Intrinsic::aarch64_sve_srshl: | |||
| 1713 | return instCombineSVESrshl(IC, II); | |||
| 1714 | case Intrinsic::aarch64_sve_dupq_lane: | |||
| 1715 | return instCombineSVEDupqLane(IC, II); | |||
| 1716 | } | |||
| 1717 | ||||
| 1718 | return std::nullopt; | |||
| 1719 | } | |||
| 1720 | ||||
| 1721 | std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( | |||
| 1722 | InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, | |||
| 1723 | APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, | |||
| 1724 | std::function<void(Instruction *, unsigned, APInt, APInt &)> | |||
| 1725 | SimplifyAndSetOp) const { | |||
| 1726 | switch (II.getIntrinsicID()) { | |||
| 1727 | default: | |||
| 1728 | break; | |||
| 1729 | case Intrinsic::aarch64_neon_fcvtxn: | |||
| 1730 | case Intrinsic::aarch64_neon_rshrn: | |||
| 1731 | case Intrinsic::aarch64_neon_sqrshrn: | |||
| 1732 | case Intrinsic::aarch64_neon_sqrshrun: | |||
| 1733 | case Intrinsic::aarch64_neon_sqshrn: | |||
| 1734 | case Intrinsic::aarch64_neon_sqshrun: | |||
| 1735 | case Intrinsic::aarch64_neon_sqxtn: | |||
| 1736 | case Intrinsic::aarch64_neon_sqxtun: | |||
| 1737 | case Intrinsic::aarch64_neon_uqrshrn: | |||
| 1738 | case Intrinsic::aarch64_neon_uqshrn: | |||
| 1739 | case Intrinsic::aarch64_neon_uqxtn: | |||
| 1740 | SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); | |||
| 1741 | break; | |||
| 1742 | } | |||
| 1743 | ||||
| 1744 | return std::nullopt; | |||
| 1745 | } | |||
| 1746 | ||||
| 1747 | TypeSize | |||
| 1748 | AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { | |||
| 1749 | switch (K) { | |||
| 1750 | case TargetTransformInfo::RGK_Scalar: | |||
| 1751 | return TypeSize::getFixed(64); | |||
| 1752 | case TargetTransformInfo::RGK_FixedWidthVector: | |||
| 1753 | if (!ST->isStreamingSVEModeDisabled() && | |||
| 1754 | !EnableFixedwidthAutovecInStreamingMode) | |||
| 1755 | return TypeSize::getFixed(0); | |||
| 1756 | ||||
| 1757 | if (ST->hasSVE()) | |||
| 1758 | return TypeSize::getFixed( | |||
| 1759 | std::max(ST->getMinSVEVectorSizeInBits(), 128u)); | |||
| 1760 | ||||
| 1761 | return TypeSize::getFixed(ST->hasNEON() ? 128 : 0); | |||
| 1762 | case TargetTransformInfo::RGK_ScalableVector: | |||
| 1763 | if (!ST->isStreamingSVEModeDisabled() && !EnableScalableAutovecInStreamingMode) | |||
| 1764 | return TypeSize::getScalable(0); | |||
| 1765 | ||||
| 1766 | return TypeSize::getScalable(ST->hasSVE() ? 128 : 0); | |||
| 1767 | } | |||
| 1768 | llvm_unreachable("Unsupported register kind"); | |||
| 1769 | } | |||
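| | // Reading of the above (annotation): with SVE enabled, the fixed-width | |||
| | // answer is the configured minimum SVE vector length clamped to at least | |||
| | // 128 bits; in streaming mode a width of 0 (disabled) is reported unless | |||
| | // the corresponding autovec option is set. | |||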
| 1770 | ||||
| 1771 | bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, | |||
| 1772 | ArrayRef<const Value *> Args) { | |||
| 1773 | ||||
| 1774 | // A helper that returns a vector type from the given type. The number of | |||
| 1775 | // elements in type Ty determines the vector width. | |||
| 1776 | auto toVectorTy = [&](Type *ArgTy) { | |||
| 1777 | return VectorType::get(ArgTy->getScalarType(), | |||
| 1778 | cast<VectorType>(DstTy)->getElementCount()); | |||
| 1779 | }; | |||
| 1780 | ||||
| 1781 | // Exit early if DstTy is not a vector type whose elements are at least | |||
| 1782 | // 16-bits wide. SVE doesn't generally have the same set of instructions to | |||
| 1783 | // perform an extend with the add/sub/mul. There are SMULLB style | |||
| 1784 | // instructions, but they operate on top/bottom, requiring some sort of lane | |||
| 1785 | // interleaving to be used with zext/sext. | |||
| 1786 | if (!useNeonVector(DstTy) || DstTy->getScalarSizeInBits() < 16) | |||
| 1787 | return false; | |||
| 1788 | ||||
| 1789 | // Determine if the operation has a widening variant. We consider both the | |||
| 1790 | // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the | |||
| 1791 | // instructions. | |||
| 1792 | // | |||
| 1793 | // TODO: Add additional widening operations (e.g., shl, etc.) once we | |||
| 1794 | // verify that their extending operands are eliminated during code | |||
| 1795 | // generation. | |||
| 1796 | switch (Opcode) { | |||
| 1797 | case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). | |||
| 1798 | case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). | |||
| 1799 | case Instruction::Mul: // SMULL(2), UMULL(2) | |||
| 1800 | break; | |||
| 1801 | default: | |||
| 1802 | return false; | |||
| 1803 | } | |||
| 1804 | ||||
| 1805 | // To be a widening instruction (either the "wide" or "long" versions), the | |||
| 1806 | // second operand must be a sign- or zero extend. | |||
| 1807 | if (Args.size() != 2 || | |||
| 1808 | (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1]))) | |||
| 1809 | return false; | |||
| 1810 | auto *Extend = cast<CastInst>(Args[1]); | |||
| 1811 | auto *Arg0 = dyn_cast<CastInst>(Args[0]); | |||
| 1812 | ||||
| 1813 | // A mul only has a mull version (not like addw). Both operands need to be | |||
| 1814 | // extending and the same type. | |||
| 1815 | if (Opcode == Instruction::Mul && | |||
| 1816 | (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() || | |||
| 1817 | Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType())) | |||
| 1818 | return false; | |||
| 1819 | ||||
| 1820 | // Legalize the destination type and ensure it can be used in a widening | |||
| 1821 | // operation. | |||
| 1822 | auto DstTyL = getTypeLegalizationCost(DstTy); | |||
| 1823 | unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); | |||
| 1824 | if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) | |||
| 1825 | return false; | |||
| 1826 | ||||
| 1827 | // Legalize the source type and ensure it can be used in a widening | |||
| 1828 | // operation. | |||
| 1829 | auto *SrcTy = toVectorTy(Extend->getSrcTy()); | |||
| 1830 | auto SrcTyL = getTypeLegalizationCost(SrcTy); | |||
| 1831 | unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); | |||
| 1832 | if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) | |||
| 1833 | return false; | |||
| 1834 | ||||
| 1835 | // Get the total number of vector elements in the legalized types. | |||
| 1836 | InstructionCost NumDstEls = | |||
| 1837 | DstTyL.first * DstTyL.second.getVectorMinNumElements(); | |||
| 1838 | InstructionCost NumSrcEls = | |||
| 1839 | SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); | |||
| 1840 | ||||
| 1841 | // Return true if the legalized types have the same number of vector elements | |||
| 1842 | // and the destination element type size is twice that of the source type. | |||
| 1843 | return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; | |||
| 1844 | } | |||
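| | // Worked example (annotation, illustrative): for | |||
| | // %e = zext <8 x i8> %x to <8 x i16> | |||
| | // %a = add <8 x i16> %y, %e | |||
| | // the legalized source (v8i8) and destination (v8i16) have the same element | |||
| | // count and the destination elements are twice as wide, so the add maps to | |||
| | // a widening UADDW-style instruction and the zext can be treated as free. | |||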
| 1845 | ||||
| 1846 | InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, | |||
| 1847 | Type *Src, | |||
| 1848 | TTI::CastContextHint CCH, | |||
| 1849 | TTI::TargetCostKind CostKind, | |||
| 1850 | const Instruction *I) { | |||
| 1851 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
| 1852 | assert(ISD && "Invalid opcode"); | |||
| 1853 | ||||
| 1854 | // If the cast is observable, and it is used by a widening instruction (e.g., | |||
| 1855 | // uaddl, saddw, etc.), it may be free. | |||
| 1856 | if (I && I->hasOneUser()) { | |||
| 1857 | auto *SingleUser = cast<Instruction>(*I->user_begin()); | |||
| 1858 | SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); | |||
| 1859 | if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { | |||
| 1860 | // If the cast is the second operand, it is free. We will generate either | |||
| 1861 | // a "wide" or "long" version of the widening instruction. | |||
| 1862 | if (I == SingleUser->getOperand(1)) | |||
| 1863 | return 0; | |||
| 1864 | // If the cast is not the second operand, it will be free if it looks the | |||
| 1865 | // same as the second operand. In this case, we will generate a "long" | |||
| 1866 | // version of the widening instruction. | |||
| 1867 | if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) | |||
| 1868 | if (I->getOpcode() == unsigned(Cast->getOpcode()) && | |||
| 1869 | cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) | |||
| 1870 | return 0; | |||
| 1871 | } | |||
| 1872 | } | |||
| 1873 | ||||
| 1874 | // TODO: Allow non-throughput costs that aren't binary. | |||
| 1875 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { | |||
| 1876 | if (CostKind != TTI::TCK_RecipThroughput) | |||
| 1877 | return Cost == 0 ? 0 : 1; | |||
| 1878 | return Cost; | |||
| 1879 | }; | |||
| 1880 | ||||
| 1881 | EVT SrcTy = TLI->getValueType(DL, Src); | |||
| 1882 | EVT DstTy = TLI->getValueType(DL, Dst); | |||
| 1883 | ||||
| 1884 | if (!SrcTy.isSimple() || !DstTy.isSimple()) | |||
| 1885 | return AdjustCost( | |||
| 1886 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | |||
| 1887 | ||||
| 1888 | static const TypeConversionCostTblEntry | |||
| 1889 | ConversionTbl[] = { | |||
| 1890 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn | |||
| 1891 | { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn | |||
| 1892 | { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn | |||
| 1893 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn | |||
| 1894 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1 | |||
| 1895 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn | |||
| 1896 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn | |||
| 1897 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1 | |||
| 1898 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn | |||
| 1899 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn | |||
| 1900 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn | |||
| 1901 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1 | |||
| 1902 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1 | |||
| 1903 | { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1 | |||
| 1904 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1 | |||
| 1905 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1 | |||
| 1906 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1 | |||
| 1907 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1 | |||
| 1908 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1 | |||
| 1909 | { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1 | |||
| 1910 | ||||
| 1911 | // Truncations on nxvmiN | |||
| 1912 | { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, | |||
| 1913 | { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, | |||
| 1914 | { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, | |||
| 1915 | { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, | |||
| 1916 | { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, | |||
| 1917 | { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, | |||
| 1918 | { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, | |||
| 1919 | { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, | |||
| 1920 | { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, | |||
| 1921 | { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, | |||
| 1922 | { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, | |||
| 1923 | { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, | |||
| 1924 | { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, | |||
| 1925 | { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, | |||
| 1926 | { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, | |||
| 1927 | { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, | |||
| 1928 | ||||
| 1929 | // The number of shll instructions for the extension. | |||
| 1930 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | |||
| 1931 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, | |||
| 1932 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | |||
| 1933 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | |||
| 1934 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, | |||
| 1935 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, | |||
| 1936 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | |||
| 1937 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | |||
| 1938 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, | |||
| 1939 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, | |||
| 1940 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, | |||
| 1941 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, | |||
| 1942 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | |||
| 1943 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | |||
| 1944 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, | |||
| 1945 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, | |||
| 1946 | ||||
| 1947 | // LowerVectorINT_TO_FP: | |||
| 1948 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, | |||
| 1949 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
| 1950 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | |||
| 1951 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, | |||
| 1952 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
| 1953 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | |||
| 1954 | ||||
| 1955 | // Complex: to v2f32 | |||
| 1956 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, | |||
| 1957 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, | |||
| 1958 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, | |||
| 1959 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, | |||
| 1960 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, | |||
| 1961 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, | |||
| 1962 | ||||
| 1963 | // Complex: to v4f32 | |||
| 1964 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, | |||
| 1965 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, | |||
| 1966 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, | |||
| 1967 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, | |||
| 1968 | ||||
| 1969 | // Complex: to v8f32 | |||
| 1970 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, | |||
| 1971 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | |||
| 1972 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, | |||
| 1973 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | |||
| 1974 | ||||
| 1975 | // Complex: to v16f32 | |||
| 1976 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, | |||
| 1977 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, | |||
| 1978 | ||||
| 1979 | // Complex: to v2f64 | |||
| 1980 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, | |||
| 1981 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, | |||
| 1982 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, | |||
| 1983 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, | |||
| 1984 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, | |||
| 1985 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, | |||
| 1986 | ||||
| 1987 | // Complex: to v4f64 | |||
| 1988 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 }, | |||
| 1989 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 }, | |||
| 1990 | ||||
| 1991 | // LowerVectorFP_TO_INT | |||
| 1992 | { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, | |||
| 1993 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, | |||
| 1994 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, | |||
| 1995 | { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, | |||
| 1996 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, | |||
| 1997 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, | |||
| 1998 | ||||
| 1999 | // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). | |||
| 2000 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, | |||
| 2001 | { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, | |||
| 2002 | { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, | |||
| 2003 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, | |||
| 2004 | { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, | |||
| 2005 | { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, | |||
| 2006 | ||||
| 2007 | // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 | |||
| 2008 | { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, | |||
| 2009 | { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, | |||
| 2010 | { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, | |||
| 2011 | { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, | |||
| 2012 | ||||
| 2013 | // Complex, from nxv2f32. | |||
| 2014 | { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, | |||
| 2015 | { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, | |||
| 2016 | { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, | |||
| 2017 | { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, | |||
| 2018 | { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, | |||
| 2019 | { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, | |||
| 2020 | { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, | |||
| 2021 | { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, | |||
| 2022 | ||||
| 2023 | // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. | |||
| 2024 | { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, | |||
| 2025 | { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, | |||
| 2026 | { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, | |||
| 2027 | { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, | |||
| 2028 | { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, | |||
| 2029 | { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, | |||
| 2030 | ||||
| 2031 | // Complex, from nxv2f64. | |||
| 2032 | { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, | |||
| 2033 | { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, | |||
| 2034 | { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, | |||
| 2035 | { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, | |||
| 2036 | { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, | |||
| 2037 | { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, | |||
| 2038 | { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, | |||
| 2039 | { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, | |||
| 2040 | ||||
| 2041 | // Complex, from nxv4f32. | |||
| 2042 | { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, | |||
| 2043 | { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, | |||
| 2044 | { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, | |||
| 2045 | { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, | |||
| 2046 | { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, | |||
| 2047 | { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, | |||
| 2048 | { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, | |||
| 2049 | { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, | |||
| 2050 | ||||
| 2051 | // Complex, from nxv8f64. Illegal -> illegal conversions not required. | |||
| 2052 | { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, | |||
| 2053 | { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, | |||
| 2054 | { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, | |||
| 2055 | { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, | |||
| 2056 | ||||
| 2057 | // Complex, from nxv4f64. Illegal -> illegal conversions not required. | |||
| 2058 | { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, | |||
| 2059 | { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, | |||
| 2060 | { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, | |||
| 2061 | { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, | |||
| 2062 | { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, | |||
| 2063 | { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, | |||
| 2064 | ||||
| 2065 | // Complex, from nxv8f32. Illegal -> illegal conversions not required. | |||
| 2066 | { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, | |||
| 2067 | { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, | |||
| 2068 | { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, | |||
| 2069 | { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, | |||
| 2070 | ||||
| 2071 | // Complex, from nxv8f16. | |||
| 2072 | { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, | |||
| 2073 | { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, | |||
| 2074 | { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, | |||
| 2075 | { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, | |||
| 2076 | { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, | |||
| 2077 | { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, | |||
| 2078 | { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, | |||
| 2079 | { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, | |||
| 2080 | ||||
| 2081 | // Complex, from nxv4f16. | |||
| 2082 | { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, | |||
| 2083 | { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, | |||
| 2084 | { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, | |||
| 2085 | { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, | |||
| 2086 | { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, | |||
| 2087 | { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, | |||
| 2088 | { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, | |||
| 2089 | { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, | |||
| 2090 | ||||
| 2091 | // Complex, from nxv2f16. | |||
| 2092 | { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, | |||
| 2093 | { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, | |||
| 2094 | { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, | |||
| 2095 | { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, | |||
| 2096 | { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, | |||
| 2097 | { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, | |||
| 2098 | { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, | |||
| 2099 | { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, | |||
| 2100 | ||||
| 2101 | // Truncate from nxvmf32 to nxvmf16. | |||
| 2102 | { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, | |||
| 2103 | { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, | |||
| 2104 | { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, | |||
| 2105 | ||||
| 2106 | // Truncate from nxvmf64 to nxvmf16. | |||
| 2107 | { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 }, | |||
| 2108 | { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 }, | |||
| 2109 | { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 }, | |||
| 2110 | ||||
| 2111 | // Truncate from nxvmf64 to nxvmf32. | |||
| 2112 | { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 }, | |||
| 2113 | { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 }, | |||
| 2114 | { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 }, | |||
| 2115 | ||||
| 2116 | // Extend from nxvmf16 to nxvmf32. | |||
| 2117 | { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, | |||
| 2118 | { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, | |||
| 2119 | { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, | |||
| 2120 | ||||
| 2121 | // Extend from nxvmf16 to nxvmf64. | |||
| 2122 | { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, | |||
| 2123 | { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, | |||
| 2124 | { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, | |||
| 2125 | ||||
| 2126 | // Extend from nxvmf32 to nxvmf64. | |||
| 2127 | { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, | |||
| 2128 | { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, | |||
| 2129 | { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, | |||
| 2130 | ||||
| 2131 | // Bitcasts from float to integer | |||
| 2132 | { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 }, | |||
| 2133 | { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 }, | |||
| 2134 | { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 }, | |||
| 2135 | ||||
| 2136 | // Bitcasts from integer to float | |||
| 2137 | { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 }, | |||
| 2138 | { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 }, | |||
| 2139 | { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 }, | |||
| 2140 | ||||
| 2141 | // Add cost for extending to illegal (too wide) scalable vectors. | |||
| 2142 | // zero/sign extend are implemented by multiple unpack operations, | |||
| 2143 | // where each operation has a cost of 1. | |||
| 2144 | { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2}, | |||
| 2145 | { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6}, | |||
| 2146 | { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14}, | |||
| 2147 | { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2}, | |||
| 2148 | { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6}, | |||
| 2149 | { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2}, | |||
| 2150 | ||||
| 2151 | { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2}, | |||
| 2152 | { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6}, | |||
| 2153 | { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14}, | |||
| 2154 | { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2}, | |||
| 2155 | { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6}, | |||
| 2156 | { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2}, | |||
| 2157 | }; | |||
| 2158 | ||||
| 2159 | if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, | |||
| 2160 | DstTy.getSimpleVT(), | |||
| 2161 | SrcTy.getSimpleVT())) | |||
| 2162 | return AdjustCost(Entry->Cost); | |||
| 2163 | ||||
| 2164 | static const TypeConversionCostTblEntry FP16Tbl[] = { | |||
| 2165 | {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs | |||
| 2166 | {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, | |||
| 2167 | {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs | |||
| 2168 | {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, | |||
| 2169 | {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs | |||
| 2170 | {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, | |||
| 2171 | {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn | |||
| 2172 | {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, | |||
| 2173 | {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs | |||
| 2174 | {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, | |||
| 2175 | {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs | |||
| 2176 | {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, | |||
| 2177 | {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn | |||
| 2178 | {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, | |||
| 2179 | {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs | |||
| 2180 | {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, | |||
| 2181 | {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs | |||
| 2182 | {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, | |||
| 2183 | {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf | |||
| 2184 | {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf | |||
| 2185 | {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf | |||
| 2186 | {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf | |||
| 2187 | }; | |||
| 2188 | ||||
| 2189 | if (ST->hasFullFP16()) | |||
| 2190 | if (const auto *Entry = ConvertCostTableLookup( | |||
| 2191 | FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) | |||
| 2192 | return AdjustCost(Entry->Cost); | |||
| 2193 | ||||
| 2194 | // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal, | |||
| 2195 | // but we also want to include the TTI::CastContextHint::Masked case. | |||
| 2196 | if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && | |||
| 2197 | CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() && | |||
| 2198 | TLI->isTypeLegal(DstTy)) | |||
| 2199 | CCH = TTI::CastContextHint::Normal; | |||
| 2200 | ||||
| 2201 | return AdjustCost( | |||
| 2202 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | |||
| 2203 | } | |||
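| | // Illustrative reading of the tables above (annotation): e.g. a truncate | |||
| | // from v4i32 to v4i16 is costed at 1 (a single xtn), while a sign extend | |||
| | // from v8i8 to v8i64 is costed at 7 shll-style instructions; anything not | |||
| | // covered falls back to the base implementation. | |||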
| 2204 | ||||
| 2205 | InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, | |||
| 2206 | Type *Dst, | |||
| 2207 | VectorType *VecTy, | |||
| 2208 | unsigned Index) { | |||
| 2209 | ||||
| 2210 | // Make sure we were given a valid extend opcode. | |||
| 2211 | assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && | |||
| 2212 | "Invalid opcode"); | |||
| 2213 | ||||
| 2214 | // We are extending an element we extract from a vector, so the source type | |||
| 2215 | // of the extend is the element type of the vector. | |||
| 2216 | auto *Src = VecTy->getElementType(); | |||
| 2217 | ||||
| 2218 | // Sign- and zero-extends are for integer types only. | |||
| 2219 | assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); | |||
| 2220 | ||||
| 2221 | // Get the cost for the extract. We compute the cost (if any) for the extend | |||
| 2222 | // below. | |||
| 2223 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
| 2224 | InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, | |||
| 2225 | CostKind, Index, nullptr, nullptr); | |||
| 2226 | ||||
| 2227 | // Legalize the types. | |||
| 2228 | auto VecLT = getTypeLegalizationCost(VecTy); | |||
| 2229 | auto DstVT = TLI->getValueType(DL, Dst); | |||
| 2230 | auto SrcVT = TLI->getValueType(DL, Src); | |||
| 2231 | ||||
| 2232 | // If the resulting type is still a vector and the destination type is legal, | |||
| 2233 | // we may get the extension for free. If not, get the default cost for the | |||
| 2234 | // extend. | |||
| 2235 | if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) | |||
| 2236 | return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, | |||
| 2237 | CostKind); | |||
| 2238 | ||||
| 2239 | // The destination type should be larger than the element type. If not, get | |||
| 2240 | // the default cost for the extend. | |||
| 2241 | if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) | |||
| 2242 | return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, | |||
| 2243 | CostKind); | |||
| 2244 | ||||
| 2245 | switch (Opcode) { | |||
| 2246 | default: | |||
| 2247 | llvm_unreachable("Opcode should be either SExt or ZExt")::llvm::llvm_unreachable_internal("Opcode should be either SExt or ZExt" , "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 2247 ); | |||
| 2248 | ||||
| 2249 | // For sign-extends, we only need a smov, which performs the extension | |||
| 2250 | // automatically. | |||
| 2251 | case Instruction::SExt: | |||
| 2252 | return Cost; | |||
| 2253 | ||||
| 2254 | // For zero-extends, the extend is performed automatically by a umov unless | |||
| 2255 | // the destination type is i64 and the element type is i8 or i16. | |||
| 2256 | case Instruction::ZExt: | |||
| 2257 | if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) | |||
| 2258 | return Cost; | |||
| 2259 | } | |||
| 2260 | ||||
| 2261 | // If we are unable to perform the extend for free, get the default cost. | |||
| 2262 | return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None, | |||
| 2263 | CostKind); | |||
| 2264 | } | |||
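As a hedged illustration (not part of the analyzed source), the free-extension rule above can be restated as a small standalone predicate; the bit-width parameters and the isSExt flag are names invented for this sketch.

// Minimal sketch of the smov/umov rule described above, not the LLVM code.
#include <cassert>

// Returns true when the extend of an extracted lane is expected to be free:
// smov sign-extends as part of the move, and umov zero-extends to 32 bits,
// so only an i8/i16 element zero-extended to i64 needs an extra instruction.
static bool extendIsFreeAfterExtract(bool isSExt, unsigned dstBits,
                                     unsigned srcBits) {
  if (isSExt)
    return true;
  return dstBits != 64 || srcBits == 32;
}

int main() {
  assert(extendIsFreeAfterExtract(true, 64, 8));    // sext i8 -> i64: smov
  assert(extendIsFreeAfterExtract(false, 32, 16));  // zext i16 -> i32: umov
  assert(!extendIsFreeAfterExtract(false, 64, 16)); // zext i16 -> i64: extra op
}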
| 2265 | ||||
| 2266 | InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, | |||
| 2267 | TTI::TargetCostKind CostKind, | |||
| 2268 | const Instruction *I) { | |||
| 2269 | if (CostKind != TTI::TCK_RecipThroughput) | |||
| 2270 | return Opcode == Instruction::PHI ? 0 : 1; | |||
| 2271 | assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind"); | |||
| 2272 | // Branches are assumed to be predicted. | |||
| 2273 | return 0; | |||
| 2274 | } | |||
| 2275 | ||||
| 2276 | InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I, | |||
| 2277 | Type *Val, | |||
| 2278 | unsigned Index, | |||
| 2279 | bool HasRealUse) { | |||
| 2280 | assert(Val->isVectorTy() && "This must be a vector type"); | |||
| 2281 | ||||
| 2282 | if (Index != -1U) { | |||
| 2283 | // Legalize the type. | |||
| 2284 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val); | |||
| 2285 | ||||
| 2286 | // This type is legalized to a scalar type. | |||
| 2287 | if (!LT.second.isVector()) | |||
| 2288 | return 0; | |||
| 2289 | ||||
| 2290 | // The type may be split. For fixed-width vectors we can normalize the | |||
| 2291 | // index to the new type. | |||
| 2292 | if (LT.second.isFixedLengthVector()) { | |||
| 2293 | unsigned Width = LT.second.getVectorNumElements(); | |||
| 2294 | Index = Index % Width; | |||
| 2295 | } | |||
| 2296 | ||||
| 2297 | // The element at index zero is already inside the vector. | |||
| 2298 | // - For a physical (HasRealUse==true) insert-element or extract-element | |||
| 2299 | // instruction that extracts integers, an explicit FPR -> GPR move is | |||
| 2300 | // needed. So it has non-zero cost. | |||
| 2301 | // - For the rest of cases (virtual instruction or element type is float), | |||
| 2302 | // consider the instruction free. | |||
| 2303 | if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy())) | |||
| 2304 | return 0; | |||
| 2305 | ||||
| 2306 | // This is recognising a LD1 single-element structure to one lane of one | |||
| 2307 | // register instruction. I.e., if this is an `insertelement` instruction, | |||
| 2308 | // and its second operand is a load, then we will generate a LD1, which | |||
| 2309 | // is an expensive instruction. | |||
| 2310 | if (I && dyn_cast<LoadInst>(I->getOperand(1))) | |||
| 2311 | return ST->getVectorInsertExtractBaseCost() + 1; | |||
| 2312 | ||||
| 2313 | // FIXME: | |||
| 2314 | // If the extract-element and insert-element instructions could be | |||
| 2315 | // simplified away (e.g., could be combined into users by looking at use-def | |||
| 2316 | // context), they have no cost. This is not done in the first place for | |||
| 2317 | // compile-time considerations. | |||
| 2318 | } | |||
| 2319 | ||||
| 2320 | // All other insert/extracts cost this much. | |||
| 2321 | return ST->getVectorInsertExtractBaseCost(); | |||
| 2322 | } | |||
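A hedged restatement of the lane-0 rule above as a free function; hasRealUse, elementIsInteger and baseCost are parameters invented for this sketch, standing in for HasRealUse, the scalar-type check and getVectorInsertExtractBaseCost().

// Minimal sketch only: the lane-0 insert/extract rule described above.
static unsigned insertExtractCost(unsigned index, bool hasRealUse,
                                  bool elementIsInteger, unsigned baseCost) {
  // Lane 0 is free unless a real integer insert/extract forces an
  // FPR <-> GPR move; every other lane pays the subtarget base cost.
  if (index == 0 && (!hasRealUse || !elementIsInteger))
    return 0;
  return baseCost;
}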
| 2323 | ||||
| 2324 | InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, | |||
| 2325 | TTI::TargetCostKind CostKind, | |||
| 2326 | unsigned Index, Value *Op0, | |||
| 2327 | Value *Op1) { | |||
| 2328 | bool HasRealUse = | |||
| 2329 | Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0); | |||
| 2330 | return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse); | |||
| 2331 | } | |||
| 2332 | ||||
| 2333 | InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, | |||
| 2334 | Type *Val, | |||
| 2335 | TTI::TargetCostKind CostKind, | |||
| 2336 | unsigned Index) { | |||
| 2337 | return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */); | |||
| 2338 | } | |||
| 2339 | ||||
| 2340 | InstructionCost AArch64TTIImpl::getArithmeticInstrCost( | |||
| 2341 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, | |||
| 2342 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, | |||
| 2343 | ArrayRef<const Value *> Args, | |||
| 2344 | const Instruction *CxtI) { | |||
| 2345 | ||||
| 2346 | // TODO: Handle more cost kinds. | |||
| 2347 | if (CostKind != TTI::TCK_RecipThroughput) | |||
| 2348 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, | |||
| 2349 | Op2Info, Args, CxtI); | |||
| 2350 | ||||
| 2351 | // Legalize the type. | |||
| 2352 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); | |||
| 2353 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
| 2354 | ||||
| 2355 | switch (ISD) { | |||
| 2356 | default: | |||
| 2357 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, | |||
| 2358 | Op2Info); | |||
| 2359 | case ISD::SDIV: | |||
| 2360 | if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) { | |||
| 2361 | // On AArch64, scalar signed division by a power-of-two constant is | |||
| 2362 | // normally expanded to the sequence ADD + CMP + SELECT + SRA. | |||
| 2363 | // The OperandValue properties may not be the same as those of the | |||
| 2364 | // previous operation; conservatively assume OP_None. | |||
| 2365 | InstructionCost Cost = getArithmeticInstrCost( | |||
| 2366 | Instruction::Add, Ty, CostKind, | |||
| 2367 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
| 2368 | Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, | |||
| 2369 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
| 2370 | Cost += getArithmeticInstrCost( | |||
| 2371 | Instruction::Select, Ty, CostKind, | |||
| 2372 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
| 2373 | Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, | |||
| 2374 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
| 2375 | return Cost; | |||
| 2376 | } | |||
| 2377 | [[fallthrough]]; | |||
| 2378 | case ISD::UDIV: { | |||
| 2379 | if (Op2Info.isConstant() && Op2Info.isUniform()) { | |||
| 2380 | auto VT = TLI->getValueType(DL, Ty); | |||
| 2381 | if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) { | |||
| 2382 | // Vector signed division by a constant is expanded to the | |||
| 2383 | // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division | |||
| 2384 | // to MULHS + SUB + SRL + ADD + SRL. | |||
| 2385 | InstructionCost MulCost = getArithmeticInstrCost( | |||
| 2386 | Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
| 2387 | InstructionCost AddCost = getArithmeticInstrCost( | |||
| 2388 | Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
| 2389 | InstructionCost ShrCost = getArithmeticInstrCost( | |||
| 2390 | Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
| 2391 | return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1; | |||
| 2392 | } | |||
| 2393 | } | |||
| 2394 | ||||
| 2395 | InstructionCost Cost = BaseT::getArithmeticInstrCost( | |||
| 2396 | Opcode, Ty, CostKind, Op1Info, Op2Info); | |||
| 2397 | if (Ty->isVectorTy()) { | |||
| 2398 | if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) { | |||
| 2399 | // If SDIV/UDIV operations are lowered using SVE, they can have lower | |||
| 2400 | // costs. | |||
| 2401 | if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty) | |||
| 2402 | ->getPrimitiveSizeInBits() | |||
| 2403 | .getFixedValue() < 128) { | |||
| 2404 | EVT VT = TLI->getValueType(DL, Ty); | |||
| 2405 | static const CostTblEntry DivTbl[]{ | |||
| 2406 | {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8}, | |||
| 2407 | {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5}, | |||
| 2408 | {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1}, | |||
| 2409 | {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8}, | |||
| 2410 | {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5}, | |||
| 2411 | {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}}; | |||
| 2412 | ||||
| 2413 | const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT()); | |||
| 2414 | if (nullptr != Entry) | |||
| 2415 | return Entry->Cost; | |||
| 2416 | } | |||
| 2417 | // For 8/16-bit elements, the cost is higher because the type | |||
| 2418 | // requires promotion and possibly splitting: | |||
| 2419 | if (LT.second.getScalarType() == MVT::i8) | |||
| 2420 | Cost *= 8; | |||
| 2421 | else if (LT.second.getScalarType() == MVT::i16) | |||
| 2422 | Cost *= 4; | |||
| 2423 | return Cost; | |||
| 2424 | } else { | |||
| 2425 | // If one of the operands is a uniform constant then the cost for each | |||
| 2426 | // element is Cost for insertion, extraction and division. | |||
| 2427 | // Insertion cost = 2, Extraction Cost = 2, Division = cost for the | |||
| 2428 | // operation with scalar type | |||
| 2429 | if ((Op1Info.isConstant() && Op1Info.isUniform()) || | |||
| 2430 | (Op2Info.isConstant() && Op2Info.isUniform())) { | |||
| 2431 | if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { | |||
| 2432 | InstructionCost DivCost = BaseT::getArithmeticInstrCost( | |||
| 2433 | Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info); | |||
| 2434 | return (4 + DivCost) * VTy->getNumElements(); | |||
| 2435 | } | |||
| 2436 | } | |||
| 2437 | // On AArch64, without SVE, vector divisions are expanded | |||
| 2438 | // into scalar divisions of each pair of elements. | |||
| 2439 | Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, | |||
| 2440 | CostKind, Op1Info, Op2Info); | |||
| 2441 | Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind, | |||
| 2442 | Op1Info, Op2Info); | |||
| 2443 | } | |||
| 2444 | ||||
| 2445 | // TODO: if one of the arguments is scalar, then it's not necessary to | |||
| 2446 | // double the cost of handling the vector elements. | |||
| 2447 | Cost += Cost; | |||
| 2448 | } | |||
| 2449 | return Cost; | |||
| 2450 | } | |||
| 2451 | case ISD::MUL: | |||
| 2452 | // When SVE is available, we can lower the v2i64 operation using | |||
| 2453 | // the SVE mul instruction, which has a lower cost. | |||
| 2454 | if (LT.second == MVT::v2i64 && ST->hasSVE()) | |||
| 2455 | return LT.first; | |||
| 2456 | ||||
| 2457 | // When SVE is not available, there is no MUL.2d instruction, | |||
| 2458 | // which means mul <2 x i64> is expensive as elements are extracted | |||
| 2459 | // from the vectors and the muls scalarized. | |||
| 2460 | // As getScalarizationOverhead is a bit too pessimistic, we | |||
| 2461 | // estimate the cost for an i64 vector directly here, which is: | |||
| 2462 | // - four 2-cost i64 extracts, | |||
| 2463 | // - two 2-cost i64 inserts, and | |||
| 2464 | // - two 1-cost muls. | |||
| 2465 | // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with | |||
| 2466 | // LT.first = 2 the cost is 28. If both operands are extensions it will not | |||
| 2467 | // need to scalarize, so the cost can be cheaper (smull or umull). | |||
| 2468 | ||||
| 2469 | if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) | |||
| 2470 | return LT.first; | |||
| 2471 | return LT.first * 14; | |||
| 2472 | case ISD::ADD: | |||
| 2473 | case ISD::XOR: | |||
| 2474 | case ISD::OR: | |||
| 2475 | case ISD::AND: | |||
| 2476 | case ISD::SRL: | |||
| 2477 | case ISD::SRA: | |||
| 2478 | case ISD::SHL: | |||
| 2479 | // These nodes are marked as 'custom' for combining purposes only. | |||
| 2480 | // We know that they are legal. See LowerAdd in ISelLowering. | |||
| 2481 | return LT.first; | |||
| 2482 | ||||
| 2483 | case ISD::FNEG: | |||
| 2484 | case ISD::FADD: | |||
| 2485 | case ISD::FSUB: | |||
| 2486 | // Increase the cost for half and bfloat types if not architecturally | |||
| 2487 | // supported. | |||
| 2488 | if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) || | |||
| 2489 | (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16())) | |||
| 2490 | return 2 * LT.first; | |||
| 2491 | if (!Ty->getScalarType()->isFP128Ty()) | |||
| 2492 | return LT.first; | |||
| 2493 | [[fallthrough]]; | |||
| 2494 | case ISD::FMUL: | |||
| 2495 | case ISD::FDIV: | |||
| 2496 | // These nodes are marked as 'custom' just to lower them to SVE. | |||
| 2497 | // We know said lowering will incur no additional cost. | |||
| 2498 | if (!Ty->getScalarType()->isFP128Ty()) | |||
| 2499 | return 2 * LT.first; | |||
| 2500 | ||||
| 2501 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, | |||
| 2502 | Op2Info); | |||
| 2503 | } | |||
| 2504 | } | |||
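A hedged worked example of the "14 per legalized v2i64" figure from the MUL comment above: four 2-cost extracts, two 2-cost inserts and two 1-cost scalar multiplies per legalized vector part.

// Illustrative arithmetic only; mirrors the comment in the MUL case above.
static unsigned scalarizedV2I64MulCost(unsigned numLegalizedParts) {
  const unsigned ExtractCost = 2, InsertCost = 2, ScalarMulCost = 1;
  unsigned PerPart = 4 * ExtractCost    // extract both lanes of both operands
                     + 2 * InsertCost   // insert the two products back
                     + 2 * ScalarMulCost;
  return numLegalizedParts * PerPart;   // v2i64 -> 14, v4i64 (two parts) -> 28
}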
| 2505 | ||||
| 2506 | InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, | |||
| 2507 | ScalarEvolution *SE, | |||
| 2508 | const SCEV *Ptr) { | |||
| 2509 | // Address computations in vectorized code with non-consecutive addresses will | |||
| 2510 | // likely result in more instructions compared to scalar code where the | |||
| 2511 | // computation can more often be merged into the index mode. The resulting | |||
| 2512 | // extra micro-ops can significantly decrease throughput. | |||
| 2513 | unsigned NumVectorInstToHideOverhead = 10; | |||
| 2514 | int MaxMergeDistance = 64; | |||
| 2515 | ||||
| 2516 | if (Ty->isVectorTy() && SE && | |||
| 2517 | !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1)) | |||
| 2518 | return NumVectorInstToHideOverhead; | |||
| 2519 | ||||
| 2520 | // In many cases the address computation is not merged into the instruction | |||
| 2521 | // addressing mode. | |||
| 2522 | return 1; | |||
| 2523 | } | |||
| 2524 | ||||
| 2525 | InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, | |||
| 2526 | Type *CondTy, | |||
| 2527 | CmpInst::Predicate VecPred, | |||
| 2528 | TTI::TargetCostKind CostKind, | |||
| 2529 | const Instruction *I) { | |||
| 2530 | // TODO: Handle other cost kinds. | |||
| 2531 | if (CostKind != TTI::TCK_RecipThroughput) | |||
| 2532 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, | |||
| 2533 | I); | |||
| 2534 | ||||
| 2535 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
| 2536 | // We don't lower some vector selects that are wider than the register | |||
| 2537 | // width very well. | |||
| 2538 | if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) { | |||
| 2539 | // We would need this many instructions to hide the scalarization happening. | |||
| 2540 | const int AmortizationCost = 20; | |||
| 2541 | ||||
| 2542 | // If VecPred is not set, check if we can get a predicate from the context | |||
| 2543 | // instruction, if its type matches the requested ValTy. | |||
| 2544 | if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { | |||
| 2545 | CmpInst::Predicate CurrentPred; | |||
| 2546 | if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(), | |||
| 2547 | m_Value()))) | |||
| 2548 | VecPred = CurrentPred; | |||
| 2549 | } | |||
| 2550 | // Check if we have a compare/select chain that can be lowered using | |||
| 2551 | // a (F)CMxx & BFI pair. | |||
| 2552 | if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE || | |||
| 2553 | VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT || | |||
| 2554 | VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ || | |||
| 2555 | VecPred == CmpInst::FCMP_UNE) { | |||
| 2556 | static const auto ValidMinMaxTys = { | |||
| 2557 | MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, | |||
| 2558 | MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64}; | |||
| 2559 | static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16}; | |||
| 2560 | ||||
| 2561 | auto LT = getTypeLegalizationCost(ValTy); | |||
| 2562 | if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) || | |||
| 2563 | (ST->hasFullFP16() && | |||
| 2564 | any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; }))) | |||
| 2565 | return LT.first; | |||
| 2566 | } | |||
| 2567 | ||||
| 2568 | static const TypeConversionCostTblEntry | |||
| 2569 | VectorSelectTbl[] = { | |||
| 2570 | { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, | |||
| 2571 | { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, | |||
| 2572 | { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, | |||
| 2573 | { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, | |||
| 2574 | { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, | |||
| 2575 | { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } | |||
| 2576 | }; | |||
| 2577 | ||||
| 2578 | EVT SelCondTy = TLI->getValueType(DL, CondTy); | |||
| 2579 | EVT SelValTy = TLI->getValueType(DL, ValTy); | |||
| 2580 | if (SelCondTy.isSimple() && SelValTy.isSimple()) { | |||
| 2581 | if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, | |||
| 2582 | SelCondTy.getSimpleVT(), | |||
| 2583 | SelValTy.getSimpleVT())) | |||
| 2584 | return Entry->Cost; | |||
| 2585 | } | |||
| 2586 | } | |||
| 2587 | // The base case handles scalable vectors fine for now, since it treats the | |||
| 2588 | // cost as 1 * legalization cost. | |||
| 2589 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); | |||
| 2590 | } | |||
| 2591 | ||||
| 2592 | AArch64TTIImpl::TTI::MemCmpExpansionOptions | |||
| 2593 | AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { | |||
| 2594 | TTI::MemCmpExpansionOptions Options; | |||
| 2595 | if (ST->requiresStrictAlign()) { | |||
| 2596 | // TODO: Add cost modeling for strict align. Misaligned loads expand to | |||
| 2597 | // a bunch of instructions when strict align is enabled. | |||
| 2598 | return Options; | |||
| 2599 | } | |||
| 2600 | Options.AllowOverlappingLoads = true; | |||
| 2601 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); | |||
| 2602 | Options.NumLoadsPerBlock = Options.MaxNumLoads; | |||
| 2603 | // TODO: Though vector loads usually perform well on AArch64, on some targets | |||
| 2604 | // they may wake up the FP unit, which raises the power consumption. Perhaps | |||
| 2605 | // they could be used with no holds barred (-O3). | |||
| 2606 | Options.LoadSizes = {8, 4, 2, 1}; | |||
| 2607 | return Options; | |||
| 2608 | } | |||
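A hedged illustration of what AllowOverlappingLoads buys for an equality memcmp: 15 bytes can be covered by two pairs of 8-byte loads with the second pair overlapping the first, rather than 8 + 4 + 2 + 1 loads. This is a generic sketch, not the code the backend actually emits; equal15 is a name invented here.

// Minimal sketch of an overlapping-load equality compare for 15 bytes.
#include <cstdint>
#include <cstring>

static bool equal15(const void *a, const void *b) {
  uint64_t a0, b0, a1, b1;
  std::memcpy(&a0, a, 8);
  std::memcpy(&b0, b, 8);
  std::memcpy(&a1, static_cast<const char *>(a) + 7, 8); // overlaps bytes 7..14
  std::memcpy(&b1, static_cast<const char *>(b) + 7, 8);
  return ((a0 ^ b0) | (a1 ^ b1)) == 0;
}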
| 2609 | ||||
| 2610 | bool AArch64TTIImpl::prefersVectorizedAddressing() const { | |||
| 2611 | return ST->hasSVE(); | |||
| 2612 | } | |||
| 2613 | ||||
| 2614 | InstructionCost | |||
| 2615 | AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, | |||
| 2616 | Align Alignment, unsigned AddressSpace, | |||
| 2617 | TTI::TargetCostKind CostKind) { | |||
| 2618 | if (useNeonVector(Src)) | |||
| 2619 | return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | |||
| 2620 | CostKind); | |||
| 2621 | auto LT = getTypeLegalizationCost(Src); | |||
| 2622 | if (!LT.first.isValid()) | |||
| 2623 | return InstructionCost::getInvalid(); | |||
| 2624 | ||||
| 2625 | // The code-generator is currently not able to handle scalable vectors | |||
| 2626 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting | |||
| 2627 | // it. This change will be removed when code-generation for these types is | |||
| 2628 | // sufficiently reliable. | |||
| 2629 | if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1)) | |||
| 2630 | return InstructionCost::getInvalid(); | |||
| 2631 | ||||
| 2632 | return LT.first; | |||
| 2633 | } | |||
| 2634 | ||||
| 2635 | static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { | |||
| 2636 | return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead; | |||
| 2637 | } | |||
| 2638 | ||||
| 2639 | InstructionCost AArch64TTIImpl::getGatherScatterOpCost( | |||
| 2640 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, | |||
| 2641 | Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { | |||
| 2642 | if (useNeonVector(DataTy)) | |||
| 2643 | return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, | |||
| 2644 | Alignment, CostKind, I); | |||
| 2645 | auto *VT = cast<VectorType>(DataTy); | |||
| 2646 | auto LT = getTypeLegalizationCost(DataTy); | |||
| 2647 | if (!LT.first.isValid()) | |||
| 2648 | return InstructionCost::getInvalid(); | |||
| 2649 | ||||
| 2650 | // The code-generator is currently not able to handle scalable vectors | |||
| 2651 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting | |||
| 2652 | // it. This change will be removed when code-generation for these types is | |||
| 2653 | // sufficiently reliable. | |||
| 2654 | if (cast<VectorType>(DataTy)->getElementCount() == | |||
| 2655 | ElementCount::getScalable(1)) | |||
| 2656 | return InstructionCost::getInvalid(); | |||
| 2657 | ||||
| 2658 | ElementCount LegalVF = LT.second.getVectorElementCount(); | |||
| 2659 | InstructionCost MemOpCost = | |||
| 2660 | getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, | |||
| 2661 | {TTI::OK_AnyValue, TTI::OP_None}, I); | |||
| 2662 | // Add on an overhead cost for using gathers/scatters. | |||
| 2663 | // TODO: At the moment this is applied unilaterally for all CPUs, but at some | |||
| 2664 | // point we may want a per-CPU overhead. | |||
| 2665 | MemOpCost *= getSVEGatherScatterOverhead(Opcode); | |||
| 2666 | return LT.first * MemOpCost * getMaxNumElements(LegalVF); | |||
| 2667 | } | |||
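A hedged restatement of the gather/scatter estimate above; the parameter names are invented for the sketch, with sveOverhead standing in for the sve-gather-overhead / sve-scatter-overhead options (both default to 10).

// Illustrative arithmetic only: roughly what the function above computes.
static unsigned gatherScatterCost(unsigned legalizedParts,
                                  unsigned scalarMemOpCost,
                                  unsigned sveOverhead,
                                  unsigned maxElementsPerPart) {
  // Each element of each legalized part is charged a scalar memory op,
  // multiplied by the per-element gather/scatter overhead.
  return legalizedParts * (scalarMemOpCost * sveOverhead) * maxElementsPerPart;
}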
| 2668 | ||||
| 2669 | bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { | |||
| 2670 | return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); | |||
| 2671 | } | |||
| 2672 | ||||
| 2673 | InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, | |||
| 2674 | MaybeAlign Alignment, | |||
| 2675 | unsigned AddressSpace, | |||
| 2676 | TTI::TargetCostKind CostKind, | |||
| 2677 | TTI::OperandValueInfo OpInfo, | |||
| 2678 | const Instruction *I) { | |||
| 2679 | EVT VT = TLI->getValueType(DL, Ty, true); | |||
| 2680 | // Type legalization can't handle structs | |||
| 2681 | if (VT == MVT::Other) | |||
| 2682 | return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, | |||
| 2683 | CostKind); | |||
| 2684 | ||||
| 2685 | auto LT = getTypeLegalizationCost(Ty); | |||
| 2686 | if (!LT.first.isValid()) | |||
| 2687 | return InstructionCost::getInvalid(); | |||
| 2688 | ||||
| 2689 | // The code-generator is currently not able to handle scalable vectors | |||
| 2690 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting | |||
| 2691 | // it. This change will be removed when code-generation for these types is | |||
| 2692 | // sufficiently reliable. | |||
| 2693 | if (auto *VTy = dyn_cast<ScalableVectorType>(Ty)) | |||
| 2694 | if (VTy->getElementCount() == ElementCount::getScalable(1)) | |||
| 2695 | return InstructionCost::getInvalid(); | |||
| 2696 | ||||
| 2697 | // TODO: consider latency as well for TCK_SizeAndLatency. | |||
| 2698 | if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) | |||
| 2699 | return LT.first; | |||
| 2700 | ||||
| 2701 | if (CostKind != TTI::TCK_RecipThroughput) | |||
| 2702 | return 1; | |||
| 2703 | ||||
| 2704 | if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && | |||
| 2705 | LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) { | |||
| 2706 | // Unaligned stores are extremely inefficient. We don't split all | |||
| 2707 | // unaligned 128-bit stores because of the negative impact that has shown up | |||
| 2708 | // in practice on inlined block copy code. | |||
| 2709 | // We make such stores expensive so that we will only vectorize if there | |||
| 2710 | // are 6 other instructions getting vectorized. | |||
| 2711 | const int AmortizationCost = 6; | |||
| 2712 | ||||
| 2713 | return LT.first * 2 * AmortizationCost; | |||
| 2714 | } | |||
| 2715 | ||||
| 2716 | // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs. | |||
| 2717 | if (Ty->isPtrOrPtrVectorTy()) | |||
| 2718 | return LT.first; | |||
| 2719 | ||||
| 2720 | // Check truncating stores and extending loads. | |||
| 2721 | if (useNeonVector(Ty) && | |||
| 2722 | Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { | |||
| 2723 | // v4i8 types are lowered to a scalar load/store and sshll/xtn. | |||
| 2724 | if (VT == MVT::v4i8) | |||
| 2725 | return 2; | |||
| 2726 | // Otherwise we need to scalarize. | |||
| 2727 | return cast<FixedVectorType>(Ty)->getNumElements() * 2; | |||
| 2728 | } | |||
| 2729 | ||||
| 2730 | return LT.first; | |||
| 2731 | } | |||
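A hedged worked example of the slow misaligned 128-bit store path above: each legalized part is charged as two stores, amortized over the roughly six other instructions the comment mentions.

// Illustrative arithmetic only; mirrors the misaligned-store branch above.
static unsigned slowMisaligned128StoreCost(unsigned legalizedParts) {
  const unsigned AmortizationCost = 6;
  return legalizedParts * 2 * AmortizationCost; // one misaligned v4i32 store -> 12
}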
| 2732 | ||||
| 2733 | InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( | |||
| 2734 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, | |||
| 2735 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, | |||
| 2736 | bool UseMaskForCond, bool UseMaskForGaps) { | |||
| 2737 | assert(Factor >= 2 && "Invalid interleave factor"); | |||
| 2738 | auto *VecVTy = cast<FixedVectorType>(VecTy); | |||
| 2739 | ||||
| 2740 | if (!UseMaskForCond && !UseMaskForGaps && | |||
| 2741 | Factor <= TLI->getMaxSupportedInterleaveFactor()) { | |||
| 2742 | unsigned NumElts = VecVTy->getNumElements(); | |||
| 2743 | auto *SubVecTy = | |||
| 2744 | FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor); | |||
| 2745 | ||||
| 2746 | // ldN/stN only support legal vector types of size 64 or 128 in bits. | |||
| 2747 | // Accesses having vector types that are a multiple of 128 bits can be | |||
| 2748 | // matched to more than one ldN/stN instruction. | |||
| 2749 | bool UseScalable; | |||
| 2750 | if (NumElts % Factor == 0 && | |||
| 2751 | TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) | |||
| 2752 | return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); | |||
| 2753 | } | |||
| 2754 | ||||
| 2755 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
| 2756 | Alignment, AddressSpace, CostKind, | |||
| 2757 | UseMaskForCond, UseMaskForGaps); | |||
| 2758 | } | |||
| 2759 | ||||
| 2760 | InstructionCost | |||
| 2761 | AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { | |||
| 2762 | InstructionCost Cost = 0; | |||
| 2763 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
| 2764 | for (auto *I : Tys) { | |||
| 2765 | if (!I->isVectorTy()) | |||
| 2766 | continue; | |||
| 2767 | if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() == | |||
| 2768 | 128) | |||
| 2769 | Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) + | |||
| 2770 | getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind); | |||
| 2771 | } | |||
| 2772 | return Cost; | |||
| 2773 | } | |||
| 2774 | ||||
| 2775 | unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) { | |||
| 2776 | return ST->getMaxInterleaveFactor(); | |||
| 2777 | } | |||
| 2778 | ||||
| 2779 | // For Falkor, we want to avoid having too many strided loads in a loop since | |||
| 2780 | // that can exhaust the HW prefetcher resources. We adjust the unroller | |||
| 2781 | // MaxCount preference below to attempt to ensure unrolling doesn't create too | |||
| 2782 | // many strided loads. | |||
| 2783 | static void | |||
| 2784 | getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, | |||
| 2785 | TargetTransformInfo::UnrollingPreferences &UP) { | |||
| 2786 | enum { MaxStridedLoads = 7 }; | |||
| 2787 | auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { | |||
| 2788 | int StridedLoads = 0; | |||
| 2789 | // FIXME? We could make this more precise by looking at the CFG and | |||
| 2790 | // e.g. not counting loads in each side of an if-then-else diamond. | |||
| 2791 | for (const auto BB : L->blocks()) { | |||
| 2792 | for (auto &I : *BB) { | |||
| 2793 | LoadInst *LMemI = dyn_cast<LoadInst>(&I); | |||
| 2794 | if (!LMemI) | |||
| 2795 | continue; | |||
| 2796 | ||||
| 2797 | Value *PtrValue = LMemI->getPointerOperand(); | |||
| 2798 | if (L->isLoopInvariant(PtrValue)) | |||
| 2799 | continue; | |||
| 2800 | ||||
| 2801 | const SCEV *LSCEV = SE.getSCEV(PtrValue); | |||
| 2802 | const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); | |||
| 2803 | if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) | |||
| 2804 | continue; | |||
| 2805 | ||||
| 2806 | // FIXME? We could take pairing of unrolled load copies into account | |||
| 2807 | // by looking at the AddRec, but we would probably have to limit this | |||
| 2808 | // to loops with no stores or other memory optimization barriers. | |||
| 2809 | ++StridedLoads; | |||
| 2810 | // We've seen enough strided loads that seeing more won't make a | |||
| 2811 | // difference. | |||
| 2812 | if (StridedLoads > MaxStridedLoads / 2) | |||
| 2813 | return StridedLoads; | |||
| 2814 | } | |||
| 2815 | } | |||
| 2816 | return StridedLoads; | |||
| 2817 | }; | |||
| 2818 | ||||
| 2819 | int StridedLoads = countStridedLoads(L, SE); | |||
| 2820 | LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoadsdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("aarch64tti")) { dbgs() << "falkor-hwpf: detected " << StridedLoads << " strided loads\n"; } } while (false) | |||
| 2821 | << " strided loads\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("aarch64tti")) { dbgs() << "falkor-hwpf: detected " << StridedLoads << " strided loads\n"; } } while (false); | |||
| 2822 | // Pick the largest power of 2 unroll count that won't result in too many | |||
| 2823 | // strided loads. | |||
| 2824 | if (StridedLoads) { | |||
| 2825 | UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads); | |||
| 2826 | LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("aarch64tti")) { dbgs() << "falkor-hwpf: setting unroll MaxCount to " << UP.MaxCount << '\n'; } } while (false) | |||
| 2827 | << UP.MaxCount << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("aarch64tti")) { dbgs() << "falkor-hwpf: setting unroll MaxCount to " << UP.MaxCount << '\n'; } } while (false); | |||
| 2828 | } | |||
| 2829 | } | |||
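A hedged sketch, assuming llvm::Log2_32 returns an all-ones unsigned value for a zero input: if the quotient MaxStridedLoads / StridedLoads could ever be zero, the shift on source line 2825 would use a count wider than int. The early return in countStridedLoads bounds StridedLoads in practice; the clamped form below is only an illustration of a defensive rewrite, not a proposed patch.

// Minimal defensive sketch of the MaxCount computation on source line 2825.
static unsigned clampedMaxCount(unsigned maxStridedLoads, unsigned stridedLoads) {
  if (stridedLoads == 0 || stridedLoads > maxStridedLoads)
    return 1; // nothing to unroll for, or already too many strided loads
  unsigned Log2 = 0;
  for (unsigned Q = maxStridedLoads / stridedLoads; Q > 1; Q >>= 1)
    ++Log2; // portable floor(log2) of a non-zero quotient
  return 1u << Log2;
}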
| 2830 | ||||
| 2831 | void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, | |||
| 2832 | TTI::UnrollingPreferences &UP, | |||
| 2833 | OptimizationRemarkEmitter *ORE) { | |||
| 2834 | // Enable partial unrolling and runtime unrolling. | |||
| 2835 | BaseT::getUnrollingPreferences(L, SE, UP, ORE); | |||
| 2836 | ||||
| 2837 | UP.UpperBound = true; | |||
| 2838 | ||||
| 2839 | // For inner loop, it is more likely to be a hot one, and the runtime check | |||
| 2840 | // can be promoted out from LICM pass, so the overhead is less, let's try | |||
| 2841 | // a larger threshold to unroll more loops. | |||
| 2842 | if (L->getLoopDepth() > 1) | |||
| 2843 | UP.PartialThreshold *= 2; | |||
| 2844 | ||||
| 2845 | // Disable partial & runtime unrolling on -Os. | |||
| 2846 | UP.PartialOptSizeThreshold = 0; | |||
| 2847 | ||||
| 2848 | if (ST->getProcFamily() == AArch64Subtarget::Falkor && | |||
| 2849 | EnableFalkorHWPFUnrollFix) | |||
| 2850 | getFalkorUnrollingPreferences(L, SE, UP); | |||
| 2851 | ||||
| 2852 | // Scan the loop: don't unroll loops with calls as this could prevent | |||
| 2853 | // inlining. Don't unroll vector loops either, as they don't benefit much from | |||
| 2854 | // unrolling. | |||
| 2855 | for (auto *BB : L->getBlocks()) { | |||
| 2856 | for (auto &I : *BB) { | |||
| 2857 | // Don't unroll vectorised loops. | |||
| 2858 | if (I.getType()->isVectorTy()) | |||
| 2859 | return; | |||
| 2860 | ||||
| 2861 | if (isa<CallInst>(I) || isa<InvokeInst>(I)) { | |||
| 2862 | if (const Function *F = cast<CallBase>(I).getCalledFunction()) { | |||
| 2863 | if (!isLoweredToCall(F)) | |||
| 2864 | continue; | |||
| 2865 | } | |||
| 2866 | return; | |||
| 2867 | } | |||
| 2868 | } | |||
| 2869 | } | |||
| 2870 | ||||
| 2871 | // Enable runtime unrolling for in-order models | |||
| 2872 | // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by | |||
| 2873 | // checking for that case, we can ensure that the default behaviour is | |||
| 2874 | // unchanged | |||
| 2875 | if (ST->getProcFamily() != AArch64Subtarget::Others && | |||
| 2876 | !ST->getSchedModel().isOutOfOrder()) { | |||
| 2877 | UP.Runtime = true; | |||
| 2878 | UP.Partial = true; | |||
| 2879 | UP.UnrollRemainder = true; | |||
| 2880 | UP.DefaultUnrollRuntimeCount = 4; | |||
| 2881 | ||||
| 2882 | UP.UnrollAndJam = true; | |||
| 2883 | UP.UnrollAndJamInnerLoopThreshold = 60; | |||
| 2884 | } | |||
| 2885 | } | |||
| 2886 | ||||
| 2887 | void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, | |||
| 2888 | TTI::PeelingPreferences &PP) { | |||
| 2889 | BaseT::getPeelingPreferences(L, SE, PP); | |||
| 2890 | } | |||
| 2891 | ||||
| 2892 | Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, | |||
| 2893 | Type *ExpectedType) { | |||
| 2894 | switch (Inst->getIntrinsicID()) { | |||
| 2895 | default: | |||
| 2896 | return nullptr; | |||
| 2897 | case Intrinsic::aarch64_neon_st2: | |||
| 2898 | case Intrinsic::aarch64_neon_st3: | |||
| 2899 | case Intrinsic::aarch64_neon_st4: { | |||
| 2900 | // Create a struct type | |||
| 2901 | StructType *ST = dyn_cast<StructType>(ExpectedType); | |||
| 2902 | if (!ST) | |||
| 2903 | return nullptr; | |||
| 2904 | unsigned NumElts = Inst->arg_size() - 1; | |||
| 2905 | if (ST->getNumElements() != NumElts) | |||
| 2906 | return nullptr; | |||
| 2907 | for (unsigned i = 0, e = NumElts; i != e; ++i) { | |||
| 2908 | if (Inst->getArgOperand(i)->getType() != ST->getElementType(i)) | |||
| 2909 | return nullptr; | |||
| 2910 | } | |||
| 2911 | Value *Res = PoisonValue::get(ExpectedType); | |||
| 2912 | IRBuilder<> Builder(Inst); | |||
| 2913 | for (unsigned i = 0, e = NumElts; i != e; ++i) { | |||
| 2914 | Value *L = Inst->getArgOperand(i); | |||
| 2915 | Res = Builder.CreateInsertValue(Res, L, i); | |||
| 2916 | } | |||
| 2917 | return Res; | |||
| 2918 | } | |||
| 2919 | case Intrinsic::aarch64_neon_ld2: | |||
| 2920 | case Intrinsic::aarch64_neon_ld3: | |||
| 2921 | case Intrinsic::aarch64_neon_ld4: | |||
| 2922 | if (Inst->getType() == ExpectedType) | |||
| 2923 | return Inst; | |||
| 2924 | return nullptr; | |||
| 2925 | } | |||
| 2926 | } | |||
| 2927 | ||||
| 2928 | bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, | |||
| 2929 | MemIntrinsicInfo &Info) { | |||
| 2930 | switch (Inst->getIntrinsicID()) { | |||
| 2931 | default: | |||
| 2932 | break; | |||
| 2933 | case Intrinsic::aarch64_neon_ld2: | |||
| 2934 | case Intrinsic::aarch64_neon_ld3: | |||
| 2935 | case Intrinsic::aarch64_neon_ld4: | |||
| 2936 | Info.ReadMem = true; | |||
| 2937 | Info.WriteMem = false; | |||
| 2938 | Info.PtrVal = Inst->getArgOperand(0); | |||
| 2939 | break; | |||
| 2940 | case Intrinsic::aarch64_neon_st2: | |||
| 2941 | case Intrinsic::aarch64_neon_st3: | |||
| 2942 | case Intrinsic::aarch64_neon_st4: | |||
| 2943 | Info.ReadMem = false; | |||
| 2944 | Info.WriteMem = true; | |||
| 2945 | Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1); | |||
| 2946 | break; | |||
| 2947 | } | |||
| 2948 | ||||
| 2949 | switch (Inst->getIntrinsicID()) { | |||
| 2950 | default: | |||
| 2951 | return false; | |||
| 2952 | case Intrinsic::aarch64_neon_ld2: | |||
| 2953 | case Intrinsic::aarch64_neon_st2: | |||
| 2954 | Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; | |||
| 2955 | break; | |||
| 2956 | case Intrinsic::aarch64_neon_ld3: | |||
| 2957 | case Intrinsic::aarch64_neon_st3: | |||
| 2958 | Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; | |||
| 2959 | break; | |||
| 2960 | case Intrinsic::aarch64_neon_ld4: | |||
| 2961 | case Intrinsic::aarch64_neon_st4: | |||
| 2962 | Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; | |||
| 2963 | break; | |||
| 2964 | } | |||
| 2965 | return true; | |||
| 2966 | } | |||
| 2967 | ||||
| 2968 | /// See if \p I should be considered for address type promotion. We check if \p | |||
| 2969 | /// I is a sext with the right type that is used in memory accesses. If it is used in a | |||
| 2970 | /// "complex" getelementptr, we allow it to be promoted without finding other | |||
| 2971 | /// sext instructions that sign extended the same initial value. A getelementptr | |||
| 2972 | /// is considered as "complex" if it has more than 2 operands. | |||
| 2973 | bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( | |||
| 2974 | const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { | |||
| 2975 | bool Considerable = false; | |||
| 2976 | AllowPromotionWithoutCommonHeader = false; | |||
| 2977 | if (!isa<SExtInst>(&I)) | |||
| 2978 | return false; | |||
| 2979 | Type *ConsideredSExtType = | |||
| 2980 | Type::getInt64Ty(I.getParent()->getParent()->getContext()); | |||
| 2981 | if (I.getType() != ConsideredSExtType) | |||
| 2982 | return false; | |||
| 2983 | // See if the sext is the one with the right type and used in at least one | |||
| 2984 | // GetElementPtrInst. | |||
| 2985 | for (const User *U : I.users()) { | |||
| 2986 | if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) { | |||
| 2987 | Considerable = true; | |||
| 2988 | // A getelementptr is considered as "complex" if it has more than 2 | |||
| 2989 | // operands. We will promote a SExt used in such a complex GEP, as we | |||
| 2990 | // expect some computation to be merged if it is done on 64 bits. | |||
| 2991 | if (GEPInst->getNumOperands() > 2) { | |||
| 2992 | AllowPromotionWithoutCommonHeader = true; | |||
| 2993 | break; | |||
| 2994 | } | |||
| 2995 | } | |||
| 2996 | } | |||
| 2997 | return Considerable; | |||
| 2998 | } | |||
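As a hedged illustration of the "complex" (more than 2 operands) getelementptr the function above looks for: a 2-D array access whose 32-bit indices are sign-extended to i64 typically produces such a GEP, and promoting the sext lets the extension fold into the 64-bit address computation. The function name and types below are invented for the example.

// Illustrative C++ only: the sext feeding a multi-operand GEP.
long readCell(long (*table)[16], int i, int j) {
  // Roughly: %i.ext = sext i32 %i to i64, %j.ext = sext i32 %j to i64,
  // getelementptr [16 x i64], ptr %table, i64 %i.ext, i64 %j.ext
  // -- a GEP with more than 2 operands, so the sext is worth promoting.
  return table[i][j];
}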
| 2999 | ||||
| 3000 | bool AArch64TTIImpl::isLegalToVectorizeReduction( | |||
| 3001 | const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { | |||
| 3002 | if (!VF.isScalable()) | |||
| 3003 | return true; | |||
| 3004 | ||||
| 3005 | Type *Ty = RdxDesc.getRecurrenceType(); | |||
| 3006 | if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) | |||
| 3007 | return false; | |||
| 3008 | ||||
| 3009 | switch (RdxDesc.getRecurrenceKind()) { | |||
| 3010 | case RecurKind::Add: | |||
| 3011 | case RecurKind::FAdd: | |||
| 3012 | case RecurKind::And: | |||
| 3013 | case RecurKind::Or: | |||
| 3014 | case RecurKind::Xor: | |||
| 3015 | case RecurKind::SMin: | |||
| 3016 | case RecurKind::SMax: | |||
| 3017 | case RecurKind::UMin: | |||
| 3018 | case RecurKind::UMax: | |||
| 3019 | case RecurKind::FMin: | |||
| 3020 | case RecurKind::FMax: | |||
| 3021 | case RecurKind::SelectICmp: | |||
| 3022 | case RecurKind::SelectFCmp: | |||
| 3023 | case RecurKind::FMulAdd: | |||
| 3024 | return true; | |||
| 3025 | default: | |||
| 3026 | return false; | |||
| 3027 | } | |||
| 3028 | } | |||
| 3029 | ||||
| 3030 | InstructionCost | |||
| 3031 | AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, | |||
| 3032 | bool IsUnsigned, FastMathFlags FMF, | |||
| 3033 | TTI::TargetCostKind CostKind) { | |||
| 3034 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); | |||
| 3035 | ||||
| 3036 | if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) | |||
| 3037 | return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, FMF, CostKind); | |||
| 3038 | ||||
| 3039 | assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) && | |||
| 3040 | "Both vectors need to be equally scalable"); | |||
| 3041 | ||||
| 3042 | InstructionCost LegalizationCost = 0; | |||
| 3043 | if (LT.first > 1) { | |||
| 3044 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); | |||
| 3045 | Intrinsic::ID MinMaxOpcode = | |||
| 3046 | Ty->isFPOrFPVectorTy() | |||
| 3047 | ? Intrinsic::maxnum | |||
| 3048 | : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin); | |||
| 3049 | IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy}, | |||
| 3050 | FMF); | |||
| 3051 | LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1); | |||
| 3052 | } | |||
| 3053 | ||||
| 3054 | return LegalizationCost + /*Cost of horizontal reduction*/ 2; | |||
| 3055 | } | |||
| 3056 | ||||
| 3057 | InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( | |||
| 3058 | unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { | |||
| 3059 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); | |||
| 3060 | InstructionCost LegalizationCost = 0; | |||
| 3061 | if (LT.first > 1) { | |||
| 3062 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); | |||
| 3063 | LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); | |||
| 3064 | LegalizationCost *= LT.first - 1; | |||
| 3065 | } | |||
| 3066 | ||||
| 3067 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
| 3068 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 3068 , __extension__ __PRETTY_FUNCTION__)); | |||
| 3069 | // Add the final reduction cost for the legal horizontal reduction | |||
| 3070 | switch (ISD) { | |||
| 3071 | case ISD::ADD: | |||
| 3072 | case ISD::AND: | |||
| 3073 | case ISD::OR: | |||
| 3074 | case ISD::XOR: | |||
| 3075 | case ISD::FADD: | |||
| 3076 | return LegalizationCost + 2; | |||
| 3077 | default: | |||
| 3078 | return InstructionCost::getInvalid(); | |||
| 3079 | } | |||
| 3080 | } | |||
| 3081 | ||||
| 3082 | InstructionCost | |||
| 3083 | AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, | |||
| 3084 | std::optional<FastMathFlags> FMF, | |||
| 3085 | TTI::TargetCostKind CostKind) { | |||
| 3086 | if (TTI::requiresOrderedReduction(FMF)) { | |||
| 3087 | if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) { | |||
| 3088 | InstructionCost BaseCost = | |||
| 3089 | BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | |||
| 3090 | // Add on extra cost to reflect the extra overhead on some CPUs. We still | |||
| 3091 | // end up vectorizing for more computationally intensive loops. | |||
| 3092 | return BaseCost + FixedVTy->getNumElements(); | |||
| 3093 | } | |||
| 3094 | ||||
| 3095 | if (Opcode != Instruction::FAdd) | |||
| 3096 | return InstructionCost::getInvalid(); | |||
| 3097 | ||||
| 3098 | auto *VTy = cast<ScalableVectorType>(ValTy); | |||
| 3099 | InstructionCost Cost = | |||
| 3100 | getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind); | |||
| 3101 | Cost *= getMaxNumElements(VTy->getElementCount()); | |||
| 3102 | return Cost; | |||
| 3103 | } | |||
| 3104 | ||||
| 3105 | if (isa<ScalableVectorType>(ValTy)) | |||
| 3106 | return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); | |||
| 3107 | ||||
| 3108 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); | |||
| 3109 | MVT MTy = LT.second; | |||
| 3110 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
| 3111 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 3111 , __extension__ __PRETTY_FUNCTION__)); | |||
| 3112 | ||||
| 3113 | // Horizontal adds can use the 'addv' instruction. We model the cost of these | |||
| 3114 | // instructions as twice a normal vector add, plus 1 for each legalization | |||
| 3115 | // step (LT.first). This is the only arithmetic vector reduction operation for | |||
| 3116 | // which we have an instruction. | |||
| 3117 | // OR, XOR and AND costs should match the codegen from: | |||
| 3118 | // OR: llvm/test/CodeGen/AArch64/reduce-or.ll | |||
| 3119 | // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll | |||
| 3120 | // AND: llvm/test/CodeGen/AArch64/reduce-and.ll | |||
| 3121 | static const CostTblEntry CostTblNoPairwise[]{ | |||
| 3122 | {ISD::ADD, MVT::v8i8, 2}, | |||
| 3123 | {ISD::ADD, MVT::v16i8, 2}, | |||
| 3124 | {ISD::ADD, MVT::v4i16, 2}, | |||
| 3125 | {ISD::ADD, MVT::v8i16, 2}, | |||
| 3126 | {ISD::ADD, MVT::v4i32, 2}, | |||
| 3127 | {ISD::ADD, MVT::v2i64, 2}, | |||
| 3128 | {ISD::OR, MVT::v8i8, 15}, | |||
| 3129 | {ISD::OR, MVT::v16i8, 17}, | |||
| 3130 | {ISD::OR, MVT::v4i16, 7}, | |||
| 3131 | {ISD::OR, MVT::v8i16, 9}, | |||
| 3132 | {ISD::OR, MVT::v2i32, 3}, | |||
| 3133 | {ISD::OR, MVT::v4i32, 5}, | |||
| 3134 | {ISD::OR, MVT::v2i64, 3}, | |||
| 3135 | {ISD::XOR, MVT::v8i8, 15}, | |||
| 3136 | {ISD::XOR, MVT::v16i8, 17}, | |||
| 3137 | {ISD::XOR, MVT::v4i16, 7}, | |||
| 3138 | {ISD::XOR, MVT::v8i16, 9}, | |||
| 3139 | {ISD::XOR, MVT::v2i32, 3}, | |||
| 3140 | {ISD::XOR, MVT::v4i32, 5}, | |||
| 3141 | {ISD::XOR, MVT::v2i64, 3}, | |||
| 3142 | {ISD::AND, MVT::v8i8, 15}, | |||
| 3143 | {ISD::AND, MVT::v16i8, 17}, | |||
| 3144 | {ISD::AND, MVT::v4i16, 7}, | |||
| 3145 | {ISD::AND, MVT::v8i16, 9}, | |||
| 3146 | {ISD::AND, MVT::v2i32, 3}, | |||
| 3147 | {ISD::AND, MVT::v4i32, 5}, | |||
| 3148 | {ISD::AND, MVT::v2i64, 3}, | |||
| 3149 | }; | |||
| 3150 | switch (ISD) { | |||
| 3151 | default: | |||
| 3152 | break; | |||
| 3153 | case ISD::ADD: | |||
| 3154 | if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) | |||
| 3155 | return (LT.first - 1) + Entry->Cost; | |||
| 3156 | break; | |||
| 3157 | case ISD::XOR: | |||
| 3158 | case ISD::AND: | |||
| 3159 | case ISD::OR: | |||
| 3160 | const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy); | |||
| 3161 | if (!Entry) | |||
| 3162 | break; | |||
| 3163 | auto *ValVTy = cast<FixedVectorType>(ValTy); | |||
| 3164 | if (!ValVTy->getElementType()->isIntegerTy(1) && | |||
| 3165 | MTy.getVectorNumElements() <= ValVTy->getNumElements() && | |||
| 3166 | isPowerOf2_32(ValVTy->getNumElements())) { | |||
| 3167 | InstructionCost ExtraCost = 0; | |||
| 3168 | if (LT.first != 1) { | |||
| 3169 | // Type needs to be split, so there is an extra cost of LT.first - 1 | |||
| 3170 | // arithmetic ops. | |||
| 3171 | auto *Ty = FixedVectorType::get(ValTy->getElementType(), | |||
| 3172 | MTy.getVectorNumElements()); | |||
| 3173 | ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); | |||
| 3174 | ExtraCost *= LT.first - 1; | |||
| 3175 | } | |||
| 3176 | return Entry->Cost + ExtraCost; | |||
| 3177 | } | |||
| 3178 | break; | |||
| 3179 | } | |||
| 3180 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | |||
| 3181 | } | |||
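A hedged worked example of the fixed-width integer ADD reduction cost above: one addv-style table entry (cost 2) plus one extra vector add per additional legalization step.

// Illustrative arithmetic only; mirrors the ISD::ADD branch above.
static unsigned addvReductionCost(unsigned legalizedParts, unsigned tableCost = 2) {
  return (legalizedParts - 1) + tableCost; // v4i32 -> 2, v16i32 (4 parts) -> 5
}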
| 3182 | ||||
| 3183 | InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { | |||
| 3184 | static const CostTblEntry ShuffleTbl[] = { | |||
| 3185 | { TTI::SK_Splice, MVT::nxv16i8, 1 }, | |||
| 3186 | { TTI::SK_Splice, MVT::nxv8i16, 1 }, | |||
| 3187 | { TTI::SK_Splice, MVT::nxv4i32, 1 }, | |||
| 3188 | { TTI::SK_Splice, MVT::nxv2i64, 1 }, | |||
| 3189 | { TTI::SK_Splice, MVT::nxv2f16, 1 }, | |||
| 3190 | { TTI::SK_Splice, MVT::nxv4f16, 1 }, | |||
| 3191 | { TTI::SK_Splice, MVT::nxv8f16, 1 }, | |||
| 3192 | { TTI::SK_Splice, MVT::nxv2bf16, 1 }, | |||
| 3193 | { TTI::SK_Splice, MVT::nxv4bf16, 1 }, | |||
| 3194 | { TTI::SK_Splice, MVT::nxv8bf16, 1 }, | |||
| 3195 | { TTI::SK_Splice, MVT::nxv2f32, 1 }, | |||
| 3196 | { TTI::SK_Splice, MVT::nxv4f32, 1 }, | |||
| 3197 | { TTI::SK_Splice, MVT::nxv2f64, 1 }, | |||
| 3198 | }; | |||
| 3199 | ||||
| 3200 | // The code-generator is currently not able to handle scalable vectors | |||
| 3201 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting | |||
| 3202 | // it. This change will be removed when code-generation for these types is | |||
| 3203 | // sufficiently reliable. | |||
| 3204 | if (Tp->getElementCount() == ElementCount::getScalable(1)) | |||
| 3205 | return InstructionCost::getInvalid(); | |||
| 3206 | ||||
| 3207 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); | |||
| 3208 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext()); | |||
| 3209 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
| 3210 | EVT PromotedVT = LT.second.getScalarType() == MVT::i1 | |||
| 3211 | ? TLI->getPromotedVTForPredicate(EVT(LT.second)) | |||
| 3212 | : LT.second; | |||
| 3213 | Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext()); | |||
| 3214 | InstructionCost LegalizationCost = 0; | |||
| 3215 | if (Index < 0) { | |||
| 3216 | LegalizationCost = | |||
| 3217 | getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy, | |||
| 3218 | CmpInst::BAD_ICMP_PREDICATE, CostKind) + | |||
| 3219 | getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy, | |||
| 3220 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | |||
| 3221 | } | |||
| 3222 | ||||
| 3223 | // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp. | |||
| 3224 | // The cost is computed on the promoted type. | |||
| 3225 | if (LT.second.getScalarType() == MVT::i1) { | |||
| 3226 | LegalizationCost += | |||
| 3227 | getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy, | |||
| 3228 | TTI::CastContextHint::None, CostKind) + | |||
| 3229 | getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy, | |||
| 3230 | TTI::CastContextHint::None, CostKind); | |||
| 3231 | } | |||
| 3232 | const auto *Entry = | |||
| 3233 | CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT()); | |||
| 3234 | assert(Entry && "Illegal Type for Splice")(static_cast <bool> (Entry && "Illegal Type for Splice" ) ? void (0) : __assert_fail ("Entry && \"Illegal Type for Splice\"" , "llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp", 3234 , __extension__ __PRETTY_FUNCTION__)); | |||
| 3235 | LegalizationCost += Entry->Cost; | |||
| 3236 | return LegalizationCost * LT.first; | |||
| 3237 | } | |||
| 3238 | ||||
| 3239 | InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | |||
| 3240 | VectorType *Tp, | |||
| 3241 | ArrayRef<int> Mask, | |||
| 3242 | TTI::TargetCostKind CostKind, | |||
| 3243 | int Index, VectorType *SubTp, | |||
| 3244 | ArrayRef<const Value *> Args) { | |||
| 3245 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); | |||
| 3246 | // If we have a Mask, and the LT is being legalized somehow, split the Mask | |||
| 3247 | // into smaller vectors and sum the cost of each shuffle. | |||
| 3248 | if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() && | |||
| 3249 | Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && | |||
| 3250 | cast<FixedVectorType>(Tp)->getNumElements() > | |||
| 3251 | LT.second.getVectorNumElements() && | |||
| 3252 | !Index && !SubTp) { | |||
| 3253 | unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements(); | |||
| 3254 | assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!"); | |||
| 3255 | unsigned LTNumElts = LT.second.getVectorNumElements(); | |||
| 3256 | unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; | |||
| 3257 | VectorType *NTp = | |||
| 3258 | VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount()); | |||
| 3259 | InstructionCost Cost; | |||
| 3260 | for (unsigned N = 0; N < NumVecs; N++) { | |||
| 3261 | SmallVector<int> NMask; | |||
| 3262 | // Split the existing mask into chunks of size LTNumElts. Track the source | |||
| 3263 | // sub-vectors to ensure the result has at most 2 inputs. | |||
| 3264 | unsigned Source1, Source2; | |||
| 3265 | unsigned NumSources = 0; | |||
| 3266 | for (unsigned E = 0; E < LTNumElts; E++) { | |||
| 3267 | int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E] | |||
| 3268 | : PoisonMaskElem; | |||
| 3269 | if (MaskElt < 0) { | |||
| 3270 | NMask.push_back(PoisonMaskElem); | |||
| 3271 | continue; | |||
| 3272 | } | |||
| 3273 | ||||
| 3274 | // Calculate which source from the input this comes from and whether it | |||
| 3275 | // is new to us. | |||
| 3276 | unsigned Source = MaskElt / LTNumElts; | |||
| 3277 | if (NumSources == 0) { | |||
| 3278 | Source1 = Source; | |||
| 3279 | NumSources = 1; | |||
| 3280 | } else if (NumSources == 1 && Source != Source1) { | |||
| 3281 | Source2 = Source; | |||
| 3282 | NumSources = 2; | |||
| 3283 | } else if (NumSources >= 2 && Source != Source1 && Source != Source2) { | |||
| 3284 | NumSources++; | |||
| 3285 | } | |||
| 3286 | ||||
| 3287 | // Add to the new mask. For the NumSources>2 case these are not correct, | |||
| 3288 | // but are only used for the modular lane number. | |||
| 3289 | if (Source == Source1) | |||
| 3290 | NMask.push_back(MaskElt % LTNumElts); | |||
| 3291 | else if (Source == Source2) | |||
| 3292 | NMask.push_back(MaskElt % LTNumElts + LTNumElts); | |||
| 3293 | else | |||
| 3294 | NMask.push_back(MaskElt % LTNumElts); | |||
| 3295 | } | |||
| 3296 | // If the sub-mask has at most 2 input sub-vectors then re-cost it using | |||
| 3297 | // getShuffleCost. If not then cost it using the worst case. | |||
| 3298 | if (NumSources <= 2) | |||
| 3299 | Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc | |||
| 3300 | : TTI::SK_PermuteTwoSrc, | |||
| 3301 | NTp, NMask, CostKind, 0, nullptr, Args); | |||
| 3302 | else if (any_of(enumerate(NMask), [&](const auto &ME) { | |||
| 3303 | return ME.value() % LTNumElts == ME.index(); | |||
| 3304 | })) | |||
| 3305 | Cost += LTNumElts - 1; | |||
| 3306 | else | |||
| 3307 | Cost += LTNumElts; | |||
| 3308 | } | |||
| 3309 | return Cost; | |||
| 3310 | } | |||
| 3311 | ||||
| 3312 | Kind = improveShuffleKindFromMask(Kind, Mask); | |||
| 3313 | ||||
| 3314 | // Check for broadcast loads, which are supported by the LD1R instruction. | |||
| 3315 | // In terms of code-size, the shuffle vector is free when a load + dup get | |||
| 3316 | // folded into a LD1R. That's what we check and return here. For performance | |||
| 3317 | // and reciprocal throughput, a LD1R is not completely free. In this case, we | |||
| 3318 | // return the cost for the broadcast below (i.e. 1 for most/all types), so | |||
| 3319 | // that we model the load + dup sequence slightly higher because LD1R is a | |||
| 3320 | // high latency instruction. | |||
| 3321 | if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) { | |||
| 3322 | bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]); | |||
| 3323 | if (IsLoad && LT.second.isVector() && | |||
| 3324 | isLegalBroadcastLoad(Tp->getElementType(), | |||
| 3325 | LT.second.getVectorElementCount())) | |||
| 3326 | return 0; | |||
| 3327 | } | |||
| 3328 | ||||
| 3329 | // If we have 4 elements for the shuffle and a Mask, get the cost straight | |||
| 3330 | // from the perfect shuffle tables. | |||
| 3331 | if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) && | |||
| 3332 | (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) && | |||
| 3333 | all_of(Mask, [](int E) { return E < 8; })) | |||
| 3334 | return getPerfectShuffleCost(Mask); | |||
| 3335 | ||||
| 3336 | if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || | |||
| 3337 | Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || | |||
| 3338 | Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) { | |||
| 3339 | static const CostTblEntry ShuffleTbl[] = { | |||
| 3340 | // Broadcast shuffle kinds can be performed with 'dup'. | |||
| 3341 | {TTI::SK_Broadcast, MVT::v8i8, 1}, | |||
| 3342 | {TTI::SK_Broadcast, MVT::v16i8, 1}, | |||
| 3343 | {TTI::SK_Broadcast, MVT::v4i16, 1}, | |||
| 3344 | {TTI::SK_Broadcast, MVT::v8i16, 1}, | |||
| 3345 | {TTI::SK_Broadcast, MVT::v2i32, 1}, | |||
| 3346 | {TTI::SK_Broadcast, MVT::v4i32, 1}, | |||
| 3347 | {TTI::SK_Broadcast, MVT::v2i64, 1}, | |||
| 3348 | {TTI::SK_Broadcast, MVT::v4f16, 1}, | |||
| 3349 | {TTI::SK_Broadcast, MVT::v8f16, 1}, | |||
| 3350 | {TTI::SK_Broadcast, MVT::v2f32, 1}, | |||
| 3351 | {TTI::SK_Broadcast, MVT::v4f32, 1}, | |||
| 3352 | {TTI::SK_Broadcast, MVT::v2f64, 1}, | |||
| 3353 | // Transpose shuffle kinds can be performed with 'trn1/trn2' and | |||
| 3354 | // 'zip1/zip2' instructions. | |||
| 3355 | {TTI::SK_Transpose, MVT::v8i8, 1}, | |||
| 3356 | {TTI::SK_Transpose, MVT::v16i8, 1}, | |||
| 3357 | {TTI::SK_Transpose, MVT::v4i16, 1}, | |||
| 3358 | {TTI::SK_Transpose, MVT::v8i16, 1}, | |||
| 3359 | {TTI::SK_Transpose, MVT::v2i32, 1}, | |||
| 3360 | {TTI::SK_Transpose, MVT::v4i32, 1}, | |||
| 3361 | {TTI::SK_Transpose, MVT::v2i64, 1}, | |||
| 3362 | {TTI::SK_Transpose, MVT::v4f16, 1}, | |||
| 3363 | {TTI::SK_Transpose, MVT::v8f16, 1}, | |||
| 3364 | {TTI::SK_Transpose, MVT::v2f32, 1}, | |||
| 3365 | {TTI::SK_Transpose, MVT::v4f32, 1}, | |||
| 3366 | {TTI::SK_Transpose, MVT::v2f64, 1}, | |||
| 3367 | // Select shuffle kinds. | |||
| 3368 | // TODO: handle vXi8/vXi16. | |||
| 3369 | {TTI::SK_Select, MVT::v2i32, 1}, // mov. | |||
| 3370 | {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar). | |||
| 3371 | {TTI::SK_Select, MVT::v2i64, 1}, // mov. | |||
| 3372 | {TTI::SK_Select, MVT::v2f32, 1}, // mov. | |||
| 3373 | {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar). | |||
| 3374 | {TTI::SK_Select, MVT::v2f64, 1}, // mov. | |||
| 3375 | // PermuteSingleSrc shuffle kinds. | |||
| 3376 | {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov. | |||
| 3377 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case. | |||
| 3378 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov. | |||
| 3379 | {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov. | |||
| 3380 | {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case. | |||
| 3381 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov. | |||
| 3382 | {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case. | |||
| 3383 | {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case. | |||
| 3384 | {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same | |||
| 3385 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl | |||
| 3386 | {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl | |||
| 3387 | {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl | |||
| 3388 | {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl | |||
| 3389 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl | |||
| 3390 | // Reverse can be lowered with `rev`. | |||
| 3391 | {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64 | |||
| 3392 | {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT | |||
| 3393 | {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT | |||
| 3394 | {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64 | |||
| 3395 | {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT | |||
| 3396 | {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT | |||
| 3397 | {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT | |||
| 3398 | {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT | |||
| 3399 | {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT | |||
| 3400 | {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64 | |||
| 3401 | {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64 | |||
| 3402 | {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64 | |||
| 3403 | // Splice can all be lowered as `ext`. | |||
| 3404 | {TTI::SK_Splice, MVT::v2i32, 1}, | |||
| 3405 | {TTI::SK_Splice, MVT::v4i32, 1}, | |||
| 3406 | {TTI::SK_Splice, MVT::v2i64, 1}, | |||
| 3407 | {TTI::SK_Splice, MVT::v2f32, 1}, | |||
| 3408 | {TTI::SK_Splice, MVT::v4f32, 1}, | |||
| 3409 | {TTI::SK_Splice, MVT::v2f64, 1}, | |||
| 3410 | {TTI::SK_Splice, MVT::v8f16, 1}, | |||
| 3411 | {TTI::SK_Splice, MVT::v8bf16, 1}, | |||
| 3412 | {TTI::SK_Splice, MVT::v8i16, 1}, | |||
| 3413 | {TTI::SK_Splice, MVT::v16i8, 1}, | |||
| 3414 | {TTI::SK_Splice, MVT::v4bf16, 1}, | |||
| 3415 | {TTI::SK_Splice, MVT::v4f16, 1}, | |||
| 3416 | {TTI::SK_Splice, MVT::v4i16, 1}, | |||
| 3417 | {TTI::SK_Splice, MVT::v8i8, 1}, | |||
| 3418 | // Broadcast shuffle kinds for scalable vectors | |||
| 3419 | {TTI::SK_Broadcast, MVT::nxv16i8, 1}, | |||
| 3420 | {TTI::SK_Broadcast, MVT::nxv8i16, 1}, | |||
| 3421 | {TTI::SK_Broadcast, MVT::nxv4i32, 1}, | |||
| 3422 | {TTI::SK_Broadcast, MVT::nxv2i64, 1}, | |||
| 3423 | {TTI::SK_Broadcast, MVT::nxv2f16, 1}, | |||
| 3424 | {TTI::SK_Broadcast, MVT::nxv4f16, 1}, | |||
| 3425 | {TTI::SK_Broadcast, MVT::nxv8f16, 1}, | |||
| 3426 | {TTI::SK_Broadcast, MVT::nxv2bf16, 1}, | |||
| 3427 | {TTI::SK_Broadcast, MVT::nxv4bf16, 1}, | |||
| 3428 | {TTI::SK_Broadcast, MVT::nxv8bf16, 1}, | |||
| 3429 | {TTI::SK_Broadcast, MVT::nxv2f32, 1}, | |||
| 3430 | {TTI::SK_Broadcast, MVT::nxv4f32, 1}, | |||
| 3431 | {TTI::SK_Broadcast, MVT::nxv2f64, 1}, | |||
| 3432 | {TTI::SK_Broadcast, MVT::nxv16i1, 1}, | |||
| 3433 | {TTI::SK_Broadcast, MVT::nxv8i1, 1}, | |||
| 3434 | {TTI::SK_Broadcast, MVT::nxv4i1, 1}, | |||
| 3435 | {TTI::SK_Broadcast, MVT::nxv2i1, 1}, | |||
| 3436 | // Handle the cases for vector.reverse with scalable vectors | |||
| 3437 | {TTI::SK_Reverse, MVT::nxv16i8, 1}, | |||
| 3438 | {TTI::SK_Reverse, MVT::nxv8i16, 1}, | |||
| 3439 | {TTI::SK_Reverse, MVT::nxv4i32, 1}, | |||
| 3440 | {TTI::SK_Reverse, MVT::nxv2i64, 1}, | |||
| 3441 | {TTI::SK_Reverse, MVT::nxv2f16, 1}, | |||
| 3442 | {TTI::SK_Reverse, MVT::nxv4f16, 1}, | |||
| 3443 | {TTI::SK_Reverse, MVT::nxv8f16, 1}, | |||
| 3444 | {TTI::SK_Reverse, MVT::nxv2bf16, 1}, | |||
| 3445 | {TTI::SK_Reverse, MVT::nxv4bf16, 1}, | |||
| 3446 | {TTI::SK_Reverse, MVT::nxv8bf16, 1}, | |||
| 3447 | {TTI::SK_Reverse, MVT::nxv2f32, 1}, | |||
| 3448 | {TTI::SK_Reverse, MVT::nxv4f32, 1}, | |||
| 3449 | {TTI::SK_Reverse, MVT::nxv2f64, 1}, | |||
| 3450 | {TTI::SK_Reverse, MVT::nxv16i1, 1}, | |||
| 3451 | {TTI::SK_Reverse, MVT::nxv8i1, 1}, | |||
| 3452 | {TTI::SK_Reverse, MVT::nxv4i1, 1}, | |||
| 3453 | {TTI::SK_Reverse, MVT::nxv2i1, 1}, | |||
| 3454 | }; | |||
| 3455 | if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) | |||
| 3456 | return LT.first * Entry->Cost; | |||
| 3457 | } | |||
| 3458 | ||||
| 3459 | if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp)) | |||
| 3460 | return getSpliceCost(Tp, Index); | |||
| 3461 | ||||
| 3462 | // Inserting a subvector can often be done with either a D, S or H register | |||
| 3463 | // move, so long as the inserted vector is "aligned". | |||
| 3464 | if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() && | |||
| 3465 | LT.second.getSizeInBits() <= 128 && SubTp) { | |||
| 3466 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); | |||
| 3467 | if (SubLT.second.isVector()) { | |||
| 3468 | int NumElts = LT.second.getVectorNumElements(); | |||
| 3469 | int NumSubElts = SubLT.second.getVectorNumElements(); | |||
| 3470 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) | |||
| 3471 | return SubLT.first; | |||
| 3472 | } | |||
| 3473 | } | |||
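// Worked example (illustrative): inserting a <2 x float> sub-vector into a
// <4 x float> gives NumElts = 4 and NumSubElts = 2, so Index 0 or 2 satisfies
// the alignment check and the insert is costed as SubLT.first (a single
// D-register move), while an unaligned Index such as 1 falls through to the
// generic cost below.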
| 3474 | ||||
| 3475 | return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp); | |||
| 3476 | } | |||
| 3477 | ||||
| 3478 | static bool containsDecreasingPointers(Loop *TheLoop, | |||
| 3479 | PredicatedScalarEvolution *PSE) { | |||
| 3480 | const ValueToValueMap &Strides = ValueToValueMap(); | |||
| 3481 | for (BasicBlock *BB : TheLoop->blocks()) { | |||
| 3482 | // Scan the instructions in the block and look for addresses that are | |||
| 3483 | // consecutive and decreasing. | |||
| 3484 | for (Instruction &I : *BB) { | |||
| 3485 | if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) { | |||
| 3486 | Value *Ptr = getLoadStorePointerOperand(&I); | |||
| 3487 | Type *AccessTy = getLoadStoreType(&I); | |||
| 3488 | if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true, | |||
| 3489 | /*ShouldCheckWrap=*/false) | |||
| 3490 | .value_or(0) < 0) | |||
| 3491 | return true; | |||
| 3492 | } | |||
| 3493 | } | |||
| 3494 | } | |||
| 3495 | return false; | |||
| 3496 | } | |||
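// Illustrative example (not from the original source): a loop such as
//   for (int i = n - 1; i >= 0; --i)
//     Sum += A[i];
// walks A[] with a negative stride, so getPtrStride() yields a negative value
// and this helper returns true; tail-folding such a loop would also require
// reversing the loop predicate, which is what TFReverse models below.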
| 3497 | ||||
| 3498 | bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) { | |||
| 3499 | if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled) | |||
| 3500 | return false; | |||
| 3501 | ||||
| 3502 | // We don't currently support vectorisation with interleaving for SVE - with | |||
| 3503 | // such loops we're better off not using tail-folding. This gives us a chance | |||
| 3504 | // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc. | |||
| 3505 | if (TFI->IAI->hasGroups()) | |||
| 3506 | return false; | |||
| 3507 | ||||
| 3508 | TailFoldingKind Required; // Defaults to 0. | |||
| 3509 | if (TFI->LVL->getReductionVars().size()) | |||
| 3510 | Required.add(TailFoldingKind::TFReductions); | |||
| 3511 | if (TFI->LVL->getFixedOrderRecurrences().size()) | |||
| 3512 | Required.add(TailFoldingKind::TFRecurrences); | |||
| 3513 | ||||
| 3514 | // We call this to discover whether any load/store pointers in the loop have | |||
| 3515 | // negative strides. This will require extra work to reverse the loop | |||
| 3516 | // predicate, which may be expensive. | |||
| 3517 | if (containsDecreasingPointers(TFI->LVL->getLoop(), | |||
| 3518 | TFI->LVL->getPredicatedScalarEvolution())) | |||
| 3519 | Required.add(TailFoldingKind::TFReverse); | |||
| 3520 | if (!Required) | |||
| 3521 | Required.add(TailFoldingKind::TFSimple); | |||
| 3522 | ||||
| 3523 | return (TailFoldingKindLoc & Required) == Required; | |||
| 3524 | } | |||
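// Worked example (illustrative): for a loop that contains a reduction and at
// least one negatively-strided access, Required ends up as
// TFReductions | TFReverse. The hook then prefers tail-folding only when every
// required bit is also set in TailFoldingKindLoc, i.e.
// (TailFoldingKindLoc & Required) == Required; an "all" setting satisfies
// this, whereas a reductions-only setting does not.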
| 3525 | ||||
| 3526 | InstructionCost | |||
| 3527 | AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, | |||
| 3528 | int64_t BaseOffset, bool HasBaseReg, | |||
| 3529 | int64_t Scale, unsigned AddrSpace) const { | |||
| 3530 | // Scaling factors are not free at all. | |||
| 3531 | // Operands | Rt Latency | |||
| 3532 | // ------------------------------------------- | |||
| 3533 | // Rt, [Xn, Xm] | 4 | |||
| 3534 | // ------------------------------------------- | |||
| 3535 | // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 | |||
| 3536 | // Rt, [Xn, Wm, <extend> #imm] | | |||
| 3537 | TargetLoweringBase::AddrMode AM; | |||
| 3538 | AM.BaseGV = BaseGV; | |||
| 3539 | AM.BaseOffs = BaseOffset; | |||
| 3540 | AM.HasBaseReg = HasBaseReg; | |||
| 3541 | AM.Scale = Scale; | |||
| 3542 | if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) | |||
| 3543 | // Scale represents reg2 * scale, thus account for 1 if | |||
| 3544 | // it is not equal to 0 or 1. | |||
| 3545 | return AM.Scale != 0 && AM.Scale != 1; | |||
| 3546 | return -1; | |||
| 3547 | } |
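// Worked example (illustrative, not from the original source): for an i64
// access of the form "ldr x0, [x1, x2, lsl #3]" the addressing mode has
// Scale == 8, which isLegalAddressingMode() accepts for this element type, so
// the scaling factor is costed at 1. A Scale of 0 or 1 costs 0, and an
// illegal combination returns -1 to signal an unsupported mode to the caller.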
| 1 | //===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file contains some functions that are useful for math stuff. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #ifndef LLVM_SUPPORT_MATHEXTRAS_H |
| 14 | #define LLVM_SUPPORT_MATHEXTRAS_H |
| 15 | |
| 16 | #include "llvm/ADT/bit.h" |
| 17 | #include "llvm/Support/Compiler.h" |
| 18 | #include <cassert> |
| 19 | #include <climits> |
| 20 | #include <cstdint> |
| 21 | #include <cstring> |
| 22 | #include <limits> |
| 23 | #include <type_traits> |
| 24 | |
| 25 | namespace llvm { |
| 26 | |
| 27 | /// Mathematical constants. |
| 28 | namespace numbers { |
| 29 | // TODO: Track C++20 std::numbers. |
| 30 | // TODO: Favor using the hexadecimal FP constants (requires C++17). |
| 31 | constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145749P+1) https://oeis.org/A001113 |
| 32 | egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620 |
| 33 | ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162 |
| 34 | ln10 = 2.3025850929940456840, // (0x1.24bb1bbb55516P+1) https://oeis.org/A002392 |
| 35 | log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0) |
| 36 | log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2) |
| 37 | pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796 |
| 38 | inv_pi = .31830988618379067154, // (0x1.45f306bc9c883P-2) https://oeis.org/A049541 |
| 39 | sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161 |
| 40 | inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197 |
| 41 | sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A002193 |
| 42 | inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1) |
| 43 | sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194 |
| 44 | inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1) |
| 45 | phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622 |
| 46 | constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113 |
| 47 | egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620 |
| 48 | ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162 |
| 49 | ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392 |
| 50 | log2ef = 1.44269504F, // (0x1.715476P+0) |
| 51 | log10ef = .434294482F, // (0x1.bcb7b2P-2) |
| 52 | pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796 |
| 53 | inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541 |
| 54 | sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161 |
| 55 | inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197 |
| 56 | sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193 |
| 57 | inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1) |
| 58 | sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194 |
| 59 | inv_sqrt3f = .577350269F, // (0x1.279a74P-1) |
| 60 | phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622 |
| 61 | } // namespace numbers |
| 62 | |
| 63 | /// Count number of 0's from the least significant bit to the most |
| 64 | /// stopping at the first 1. |
| 65 | /// |
| 66 | /// Only unsigned integral types are allowed. |
| 67 | /// |
| 68 | /// Returns std::numeric_limits<T>::digits on an input of 0. |
| 69 | template <typename T> |
| 70 | LLVM_DEPRECATED("Use llvm::countr_zero instead.", "llvm::countr_zero")__attribute__((deprecated("Use llvm::countr_zero instead.", "llvm::countr_zero" ))) |
| 71 | unsigned countTrailingZeros(T Val) { |
| 72 | static_assert(std::is_unsigned_v<T>, |
| 73 | "Only unsigned integral types are allowed."); |
| 74 | return llvm::countr_zero(Val); |
| 75 | } |
| 76 | |
| 77 | /// Count number of 0's from the most significant bit to the least |
| 78 | /// stopping at the first 1. |
| 79 | /// |
| 80 | /// Only unsigned integral types are allowed. |
| 81 | /// |
| 82 | /// Returns std::numeric_limits<T>::digits on an input of 0. |
| 83 | template <typename T> |
| 84 | LLVM_DEPRECATED("Use llvm::countl_zero instead.", "llvm::countl_zero")__attribute__((deprecated("Use llvm::countl_zero instead.", "llvm::countl_zero" ))) |
| 85 | unsigned countLeadingZeros(T Val) { |
| 86 | static_assert(std::is_unsigned_v<T>, |
| 87 | "Only unsigned integral types are allowed."); |
| 88 | return llvm::countl_zero(Val); |
| 89 | } |
| 90 | |
| 91 | /// Create a bitmask with the N right-most bits set to 1, and all other |
| 92 | /// bits set to 0. Only unsigned types are allowed. |
| 93 | template <typename T> T maskTrailingOnes(unsigned N) { |
| 94 | static_assert(std::is_unsigned_v<T>, "Invalid type!"); |
| 95 | const unsigned Bits = CHAR_BIT * sizeof(T); |
| 96 | assert(N <= Bits && "Invalid bit index"); |
| 97 | return N == 0 ? 0 : (T(-1) >> (Bits - N)); |
| 98 | } |
| 99 | |
| 100 | /// Create a bitmask with the N left-most bits set to 1, and all other |
| 101 | /// bits set to 0. Only unsigned types are allowed. |
| 102 | template <typename T> T maskLeadingOnes(unsigned N) { |
| 103 | return ~maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N); |
| 104 | } |
| 105 | |
| 106 | /// Create a bitmask with the N right-most bits set to 0, and all other |
| 107 | /// bits set to 1. Only unsigned types are allowed. |
| 108 | template <typename T> T maskTrailingZeros(unsigned N) { |
| 109 | return maskLeadingOnes<T>(CHAR_BIT * sizeof(T) - N); |
| 110 | } |
| 111 | |
| 112 | /// Create a bitmask with the N left-most bits set to 0, and all other |
| 113 | /// bits set to 1. Only unsigned types are allowed. |
| 114 | template <typename T> T maskLeadingZeros(unsigned N) { |
| 115 | return maskTrailingOnes<T>(CHAR_BIT * sizeof(T) - N); |
| 116 | } |
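// Usage sketch (illustrative only, not part of the original header; the
// example function name is hypothetical). Expected results for the four mask
// helpers above on small widths:
inline void maskHelperExamples() {
  assert(maskTrailingOnes<uint8_t>(3) == 0x07);   // 0000'0111
  assert(maskLeadingOnes<uint16_t>(4) == 0xF000); // 1111'0000'0000'0000
  assert(maskTrailingZeros<uint8_t>(2) == 0xFC);  // 1111'1100
  assert(maskLeadingZeros<uint8_t>(2) == 0x3F);   // 0011'1111
  assert(maskTrailingOnes<uint32_t>(0) == 0);     // N == 0 is handled explicitly
}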
| 117 | |
| 118 | /// Macro compressed bit reversal table for 256 bits. |
| 119 | /// |
| 120 | /// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable |
| 121 | static const unsigned char BitReverseTable256[256] = { |
| 122 | #define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64 |
| 123 | #define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16) |
| 124 | #define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4) |
| 125 | R6(0), R6(2), R6(1), R6(3) |
| 126 | #undef R2 |
| 127 | #undef R4 |
| 128 | #undef R6 |
| 129 | }; |
| 130 | |
| 131 | /// Reverse the bits in \p Val. |
| 132 | template <typename T> T reverseBits(T Val) { |
| 133 | #if __has_builtin(__builtin_bitreverse8) |
| 134 | if constexpr (std::is_same_v<T, uint8_t>) |
| 135 | return __builtin_bitreverse8(Val); |
| 136 | #endif |
| 137 | #if __has_builtin(__builtin_bitreverse16) |
| 138 | if constexpr (std::is_same_v<T, uint16_t>) |
| 139 | return __builtin_bitreverse16(Val); |
| 140 | #endif |
| 141 | #if __has_builtin(__builtin_bitreverse32) |
| 142 | if constexpr (std::is_same_v<T, uint32_t>) |
| 143 | return __builtin_bitreverse32(Val); |
| 144 | #endif |
| 145 | #if __has_builtin(__builtin_bitreverse64) |
| 146 | if constexpr (std::is_same_v<T, uint64_t>) |
| 147 | return __builtin_bitreverse64(Val); |
| 148 | #endif |
| 149 | |
| 150 | unsigned char in[sizeof(Val)]; |
| 151 | unsigned char out[sizeof(Val)]; |
| 152 | std::memcpy(in, &Val, sizeof(Val)); |
| 153 | for (unsigned i = 0; i < sizeof(Val); ++i) |
| 154 | out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]]; |
| 155 | std::memcpy(&Val, out, sizeof(Val)); |
| 156 | return Val; |
| 157 | } |
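// Usage sketch (illustrative only; hypothetical example function):
inline void reverseBitsExamples() {
  assert(reverseBits<uint8_t>(0x01) == 0x80);      // 0000'0001 -> 1000'0000
  assert(reverseBits<uint8_t>(0x12) == 0x48);      // 0001'0010 -> 0100'1000
  assert(reverseBits<uint16_t>(0x00FF) == 0xFF00); // low byte mirrored into the high byte
}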
| 158 | |
| 159 | // NOTE: The following support functions use the _32/_64 extensions instead of |
| 160 | // type overloading so that signed and unsigned integers can be used without |
| 161 | // ambiguity. |
| 162 | |
| 163 | /// Return the high 32 bits of a 64 bit value. |
| 164 | constexpr inline uint32_t Hi_32(uint64_t Value) { |
| 165 | return static_cast<uint32_t>(Value >> 32); |
| 166 | } |
| 167 | |
| 168 | /// Return the low 32 bits of a 64 bit value. |
| 169 | constexpr inline uint32_t Lo_32(uint64_t Value) { |
| 170 | return static_cast<uint32_t>(Value); |
| 171 | } |
| 172 | |
| 173 | /// Make a 64-bit integer from a high / low pair of 32-bit integers. |
| 174 | constexpr inline uint64_t Make_64(uint32_t High, uint32_t Low) { |
| 175 | return ((uint64_t)High << 32) | (uint64_t)Low; |
| 176 | } |
| 177 | |
| 178 | /// Checks if an integer fits into the given bit width. |
| 179 | template <unsigned N> constexpr inline bool isInt(int64_t x) { |
| 180 | if constexpr (N == 8) |
| 181 | return static_cast<int8_t>(x) == x; |
| 182 | if constexpr (N == 16) |
| 183 | return static_cast<int16_t>(x) == x; |
| 184 | if constexpr (N == 32) |
| 185 | return static_cast<int32_t>(x) == x; |
| 186 | if constexpr (N < 64) |
| 187 | return -(INT64_C(1) << (N - 1)) <= x && x < (INT64_C(1) << (N - 1)); |
| 188 | (void)x; // MSVC v19.25 warns that x is unused. |
| 189 | return true; |
| 190 | } |
| 191 | |
| 192 | /// Checks if a signed integer is an N bit number shifted left by S. |
| 193 | template <unsigned N, unsigned S> |
| 194 | constexpr inline bool isShiftedInt(int64_t x) { |
| 195 | static_assert( |
| 196 | N > 0, "isShiftedInt<0> doesn't make sense (refers to a 0-bit number."); |
| 197 | static_assert(N + S <= 64, "isShiftedInt<N, S> with N + S > 64 is too wide."); |
| 198 | return isInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0); |
| 199 | } |
| 200 | |
| 201 | /// Checks if an unsigned integer fits into the given bit width. |
| 202 | template <unsigned N> constexpr inline bool isUInt(uint64_t x) { |
| 203 | static_assert(N > 0, "isUInt<0> doesn't make sense"); |
| 204 | if constexpr (N == 8) |
| 205 | return static_cast<uint8_t>(x) == x; |
| 206 | if constexpr (N == 16) |
| 207 | return static_cast<uint16_t>(x) == x; |
| 208 | if constexpr (N == 32) |
| 209 | return static_cast<uint32_t>(x) == x; |
| 210 | if constexpr (N < 64) |
| 211 | return x < (UINT64_C(1) << (N)); |
| 212 | (void)x; // MSVC v19.25 warns that x is unused. |
| 213 | return true; |
| 214 | } |
| 215 | |
| 216 | /// Checks if an unsigned integer is an N bit number shifted left by S. |
| 217 | template <unsigned N, unsigned S> |
| 218 | constexpr inline bool isShiftedUInt(uint64_t x) { |
| 219 | static_assert( |
| 220 | N > 0, "isShiftedUInt<0> doesn't make sense (refers to a 0-bit number)"); |
| 221 | static_assert(N + S <= 64, |
| 222 | "isShiftedUInt<N, S> with N + S > 64 is too wide."); |
| 223 | // Per the two static_asserts above, S must be strictly less than 64. So |
| 224 | // 1 << S is not undefined behavior. |
| 225 | return isUInt<N + S>(x) && (x % (UINT64_C(1) << S) == 0); |
| 226 | } |
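// Usage sketch (illustrative only): the checks above are constexpr, so the
// expected results can be spelled as static_asserts.
static_assert(isInt<8>(127) && !isInt<8>(128), "int8 range is [-128, 127]");
static_assert(isUInt<4>(15) && !isUInt<4>(16), "uint4 range is [0, 15]");
static_assert(isShiftedInt<8, 2>(508) && !isShiftedInt<8, 2>(510),
              "508 == 127 << 2, but 510 is not a multiple of 4");
static_assert(isShiftedUInt<4, 1>(30) && !isShiftedUInt<4, 1>(31),
              "30 == 15 << 1, but 31 is odd");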
| 227 | |
| 228 | /// Gets the maximum value for a N-bit unsigned integer. |
| 229 | inline uint64_t maxUIntN(uint64_t N) { |
| 230 | assert(N > 0 && N <= 64 && "integer width out of range"); |
| 231 | |
| 232 | // uint64_t(1) << 64 is undefined behavior, so we can't do |
| 233 | // (uint64_t(1) << N) - 1 |
| 234 | // without checking first that N != 64. But this works and doesn't have a |
| 235 | // branch. |
| 236 | return UINT64_MAX >> (64 - N); |
| 237 | } |
| 238 | |
| 239 | /// Gets the minimum value for a N-bit signed integer. |
| 240 | inline int64_t minIntN(int64_t N) { |
| 241 | assert(N > 0 && N <= 64 && "integer width out of range"); |
| 242 | |
| 243 | return UINT64_C(1) + ~(UINT64_C(1) << (N - 1)); |
| 244 | } |
| 245 | |
| 246 | /// Gets the maximum value for a N-bit signed integer. |
| 247 | inline int64_t maxIntN(int64_t N) { |
| 248 | assert(N > 0 && N <= 64 && "integer width out of range"); |
| 249 | |
| 250 | // This relies on two's complement wraparound when N == 64, so we convert to |
| 251 | // int64_t only at the very end to avoid UB. |
| 252 | return (UINT64_C(1) << (N - 1)) - 1; |
| 253 | } |
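// Usage sketch (illustrative only; hypothetical example function):
inline void intLimitExamples() {
  assert(maxUIntN(8) == 255 && maxUIntN(64) == UINT64_MAX);
  assert(minIntN(8) == -128 && maxIntN(8) == 127);
  assert(minIntN(64) == INT64_MIN && maxIntN(64) == INT64_MAX);
}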
| 254 | |
| 255 | /// Checks if an unsigned integer fits into the given (dynamic) bit width. |
| 256 | inline bool isUIntN(unsigned N, uint64_t x) { |
| 257 | return N >= 64 || x <= maxUIntN(N); |
| 258 | } |
| 259 | |
| 260 | /// Checks if a signed integer fits into the given (dynamic) bit width. |
| 261 | inline bool isIntN(unsigned N, int64_t x) { |
| 262 | return N >= 64 || (minIntN(N) <= x && x <= maxIntN(N)); |
| 263 | } |
| 264 | |
| 265 | /// Return true if the argument is a non-empty sequence of ones starting at the |
| 266 | /// least significant bit with the remainder zero (32 bit version). |
| 267 | /// Ex. isMask_32(0x0000FFFFU) == true. |
| 268 | constexpr inline bool isMask_32(uint32_t Value) { |
| 269 | return Value && ((Value + 1) & Value) == 0; |
| 270 | } |
| 271 | |
| 272 | /// Return true if the argument is a non-empty sequence of ones starting at the |
| 273 | /// least significant bit with the remainder zero (64 bit version). |
| 274 | constexpr inline bool isMask_64(uint64_t Value) { |
| 275 | return Value && ((Value + 1) & Value) == 0; |
| 276 | } |
| 277 | |
| 278 | /// Return true if the argument contains a non-empty sequence of ones with the |
| 279 | /// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true. |
| 280 | constexpr inline bool isShiftedMask_32(uint32_t Value) { |
| 281 | return Value && isMask_32((Value - 1) | Value); |
| 282 | } |
| 283 | |
| 284 | /// Return true if the argument contains a non-empty sequence of ones with the |
| 285 | /// remainder zero (64 bit version.) |
| 286 | constexpr inline bool isShiftedMask_64(uint64_t Value) { |
| 287 | return Value && isMask_64((Value - 1) | Value); |
| 288 | } |
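// Usage sketch (illustrative only):
static_assert(isMask_32(0x0000FFFFu) && !isMask_32(0x0000FF00u),
              "a mask must start at bit 0");
static_assert(isShiftedMask_32(0x0000FF00u) && !isShiftedMask_32(0x0000FF0Fu),
              "the ones must be contiguous");
static_assert(isMask_64(0x1FFFFFFFFull) && isShiftedMask_64(0x3C0000000ull),
              "64-bit variants behave the same way");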
| 289 | |
| 290 | /// Return true if the argument is a power of two > 0. |
| 291 | /// Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.) |
| 292 | constexpr inline bool isPowerOf2_32(uint32_t Value) { |
| 293 | return llvm::has_single_bit(Value); |
| 294 | } |
| 295 | |
| 296 | /// Return true if the argument is a power of two > 0 (64 bit edition.) |
| 297 | constexpr inline bool isPowerOf2_64(uint64_t Value) { |
| 298 | return llvm::has_single_bit(Value); |
| 299 | } |
| 300 | |
| 301 | /// Count the number of ones from the most significant bit to the first |
| 302 | /// zero bit. |
| 303 | /// |
| 304 | /// Ex. countLeadingOnes(0xFF0FFF00) == 8. |
| 305 | /// Only unsigned integral types are allowed. |
| 306 | /// |
| 307 | /// Returns std::numeric_limits<T>::digits on an input of all ones. |
| 308 | template <typename T> |
| 309 | LLVM_DEPRECATED("Use llvm::countl_one instead.", "llvm::countl_one") |
| 310 | unsigned countLeadingOnes(T Value) { |
| 311 | static_assert(std::is_unsigned_v<T>, |
| 312 | "Only unsigned integral types are allowed."); |
| 313 | return llvm::countl_one<T>(Value); |
| 314 | } |
| 315 | |
| 316 | /// Count the number of ones from the least significant bit to the first |
| 317 | /// zero bit. |
| 318 | /// |
| 319 | /// Ex. countTrailingOnes(0x00FF00FF) == 8. |
| 320 | /// Only unsigned integral types are allowed. |
| 321 | /// |
| 322 | /// Returns std::numeric_limits<T>::digits on an input of all ones. |
| 323 | template <typename T> |
| 324 | LLVM_DEPRECATED("Use llvm::countr_one instead.", "llvm::countr_one") |
| 325 | unsigned countTrailingOnes(T Value) { |
| 326 | static_assert(std::is_unsigned_v<T>, |
| 327 | "Only unsigned integral types are allowed."); |
| 328 | return llvm::countr_one<T>(Value); |
| 329 | } |
| 330 | |
| 331 | /// Count the number of set bits in a value. |
| 332 | /// Ex. countPopulation(0xF000F000) = 8 |
| 333 | /// Returns 0 if the word is zero. |
| 334 | template <typename T> |
| 335 | LLVM_DEPRECATED("Use llvm::popcount instead.", "llvm::popcount") |
| 336 | inline unsigned countPopulation(T Value) { |
| 337 | static_assert(std::is_unsigned_v<T>, |
| 338 | "Only unsigned integral types are allowed."); |
| 339 | return (unsigned)llvm::popcount(Value); |
| 340 | } |
| 341 | |
| 342 | /// Return true if the argument contains a non-empty sequence of ones with the |
| 343 | /// remainder zero (32 bit version.) Ex. isShiftedMask_32(0x0000FF00U) == true. |
| 344 | /// If true, \p MaskIdx will specify the index of the lowest set bit and \p |
| 345 | /// MaskLen is updated to specify the length of the mask, else neither are |
| 346 | /// updated. |
| 347 | inline bool isShiftedMask_32(uint32_t Value, unsigned &MaskIdx, |
| 348 | unsigned &MaskLen) { |
| 349 | if (!isShiftedMask_32(Value)) |
| 350 | return false; |
| 351 | MaskIdx = llvm::countr_zero(Value); |
| 352 | MaskLen = llvm::popcount(Value); |
| 353 | return true; |
| 354 | } |
| 355 | |
| 356 | /// Return true if the argument contains a non-empty sequence of ones with the |
| 357 | /// remainder zero (64 bit version.) If true, \p MaskIdx will specify the index |
| 358 | /// of the lowest set bit and \p MaskLen is updated to specify the length of the |
| 359 | /// mask, else neither are updated. |
| 360 | inline bool isShiftedMask_64(uint64_t Value, unsigned &MaskIdx, |
| 361 | unsigned &MaskLen) { |
| 362 | if (!isShiftedMask_64(Value)) |
| 363 | return false; |
| 364 | MaskIdx = llvm::countr_zero(Value); |
| 365 | MaskLen = llvm::popcount(Value); |
| 366 | return true; |
| 367 | } |
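// Usage sketch (illustrative only; hypothetical example function):
inline void shiftedMaskDecodeExample() {
  unsigned MaskIdx = 0, MaskLen = 0;
  bool IsShifted = isShiftedMask_32(0x0000FF00u, MaskIdx, MaskLen);
  assert(IsShifted && MaskIdx == 8 && MaskLen == 8); // ones occupy bits [8, 15]
}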
| 368 | |
| 369 | /// Compile time Log2. |
| 370 | /// Valid only for positive powers of two. |
| 371 | template <size_t kValue> constexpr inline size_t CTLog2() { |
| 372 | static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue), |
| 373 | "Value is not a valid power of 2"); |
| 374 | return 1 + CTLog2<kValue / 2>(); |
| 375 | } |
| 376 | |
| 377 | template <> constexpr inline size_t CTLog2<1>() { return 0; } |
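// Usage sketch (illustrative only):
static_assert(CTLog2<1>() == 0 && CTLog2<2>() == 1 && CTLog2<4096>() == 12,
              "compile-time log2 of powers of two");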
| 378 | |
| 379 | /// Return the floor log base 2 of the specified value, -1 if the value is zero. |
| 380 | /// (32 bit edition.) |
| 381 | /// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2 |
| 382 | inline unsigned Log2_32(uint32_t Value) { |
| 383 | return 31 - llvm::countl_zero(Value); |
| 384 | } |
| 385 | |
| 386 | /// Return the floor log base 2 of the specified value, -1 if the value is zero. |
| 387 | /// (64 bit edition.) |
| 388 | inline unsigned Log2_64(uint64_t Value) { |
| 389 | return 63 - llvm::countl_zero(Value); |
| 390 | } |
| 391 | |
| 392 | /// Return the ceil log base 2 of the specified value, 32 if the value is zero. |
| 393 | /// (32 bit edition). |
| 394 | /// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3 |
| 395 | inline unsigned Log2_32_Ceil(uint32_t Value) { |
| 396 | return 32 - llvm::countl_zero(Value - 1); |
| 397 | } |
| 398 | |
| 399 | /// Return the ceil log base 2 of the specified value, 64 if the value is zero. |
| 400 | /// (64 bit edition.) |
| 401 | inline unsigned Log2_64_Ceil(uint64_t Value) { |
| 402 | return 64 - llvm::countl_zero(Value - 1); |
| 403 | } |
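// Usage sketch (illustrative only; hypothetical example function):
inline void log2Examples() {
  assert(Log2_32(1) == 0 && Log2_32(32) == 5 && Log2_32(48) == 5); // floor
  assert(Log2_32_Ceil(32) == 5 && Log2_32_Ceil(33) == 6);          // ceil
  assert(Log2_64(UINT64_C(1) << 40) == 40);
}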
| 404 | |
| 405 | /// This function takes a 64-bit integer and returns the bit equivalent double. |
| 406 | LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<double>") |
| 407 | inline double BitsToDouble(uint64_t Bits) { |
| 408 | static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); |
| 409 | return llvm::bit_cast<double>(Bits); |
| 410 | } |
| 411 | |
| 412 | /// This function takes a 32-bit integer and returns the bit equivalent float. |
| 413 | LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<float>") |
| 414 | inline float BitsToFloat(uint32_t Bits) { |
| 415 | static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); |
| 416 | return llvm::bit_cast<float>(Bits); |
| 417 | } |
| 418 | |
| 419 | /// This function takes a double and returns the bit equivalent 64-bit integer. |
| 420 | /// Note that copying doubles around changes the bits of NaNs on some hosts, |
| 421 | /// notably x86, so this routine cannot be used if these bits are needed. |
| 422 | LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<uint64_t>") |
| 423 | inline uint64_t DoubleToBits(double Double) { |
| 424 | static_assert(sizeof(uint64_t) == sizeof(double), "Unexpected type sizes"); |
| 425 | return llvm::bit_cast<uint64_t>(Double); |
| 426 | } |
| 427 | |
| 428 | /// This function takes a float and returns the bit equivalent 32-bit integer. |
| 429 | /// Note that copying floats around changes the bits of NaNs on some hosts, |
| 430 | /// notably x86, so this routine cannot be used if these bits are needed. |
| 431 | LLVM_DEPRECATED("use llvm::bit_cast instead", "llvm::bit_cast<uint32_t>") |
| 432 | inline uint32_t FloatToBits(float Float) { |
| 433 | static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected type sizes"); |
| 434 | return llvm::bit_cast<uint32_t>(Float); |
| 435 | } |
| 436 | |
| 437 | /// A and B are either alignments or offsets. Return the minimum alignment that |
| 438 | /// may be assumed after adding the two together. |
| 439 | constexpr inline uint64_t MinAlign(uint64_t A, uint64_t B) { |
| 440 | // The largest power of 2 that divides both A and B. |
| 441 | // |
| 442 | // Replace "-Value" by "1+~Value" in the following commented code to avoid |
| 443 | // MSVC warning C4146 |
| 444 | // return (A | B) & -(A | B); |
| 445 | return (A | B) & (1 + ~(A | B)); |
| 446 | } |
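// Usage sketch (illustrative only): the result is the largest power of two
// that divides both arguments.
static_assert(MinAlign(8, 4) == 4 && MinAlign(16, 24) == 8 && MinAlign(32, 32) == 32,
              "greatest power-of-two common divisor");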
| 447 | |
| 448 | /// Returns the next power of two (in 64-bits) that is strictly greater than A. |
| 449 | /// Returns zero on overflow. |
| 450 | constexpr inline uint64_t NextPowerOf2(uint64_t A) { |
| 451 | A |= (A >> 1); |
| 452 | A |= (A >> 2); |
| 453 | A |= (A >> 4); |
| 454 | A |= (A >> 8); |
| 455 | A |= (A >> 16); |
| 456 | A |= (A >> 32); |
| 457 | return A + 1; |
| 458 | } |
| 459 | |
| 460 | /// Returns the power of two which is less than or equal to the given value. |
| 461 | /// Essentially, it is a floor operation across the domain of powers of two. |
| 462 | LLVM_DEPRECATED("use llvm::bit_floor instead", "llvm::bit_floor") |
| 463 | inline uint64_t PowerOf2Floor(uint64_t A) { |
| 464 | return llvm::bit_floor(A); |
| 465 | } |
| 466 | |
| 467 | /// Returns the power of two which is greater than or equal to the given value. |
| 468 | /// Essentially, it is a ceil operation across the domain of powers of two. |
| 469 | inline uint64_t PowerOf2Ceil(uint64_t A) { |
| 470 | if (!A) |
| 471 | return 0; |
| 472 | return NextPowerOf2(A - 1); |
| 473 | } |
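// Usage sketch (illustrative only; hypothetical example function).
// NextPowerOf2 is strictly greater than its argument, while PowerOf2Ceil may
// return the argument itself.
inline void powerOf2Examples() {
  static_assert(NextPowerOf2(5) == 8 && NextPowerOf2(8) == 16, "strictly greater");
  assert(PowerOf2Ceil(5) == 8 && PowerOf2Ceil(8) == 8 && PowerOf2Ceil(0) == 0);
}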
| 474 | |
| 475 | /// Returns the next integer (mod 2**64) that is greater than or equal to |
| 476 | /// \p Value and is a multiple of \p Align. \p Align must be non-zero. |
| 477 | /// |
| 478 | /// Examples: |
| 479 | /// \code |
| 480 | /// alignTo(5, 8) = 8 |
| 481 | /// alignTo(17, 8) = 24 |
| 482 | /// alignTo(~0LL, 8) = 0 |
| 483 | /// alignTo(321, 255) = 510 |
| 484 | /// \endcode |
| 485 | inline uint64_t alignTo(uint64_t Value, uint64_t Align) { |
| 486 | assert(Align != 0u && "Align can't be 0."); |
| 487 | return (Value + Align - 1) / Align * Align; |
| 488 | } |
| 489 | |
| 490 | inline uint64_t alignToPowerOf2(uint64_t Value, uint64_t Align) { |
| 491 | assert(Align != 0 && (Align & (Align - 1)) == 0 && |
| 492 | "Align must be a power of 2"); |
| 493 | return (Value + Align - 1) & -Align; |
| 494 | } |
| 495 | |
| 496 | /// If non-zero \p Skew is specified, the return value will be a minimal integer |
| 497 | /// that is greater than or equal to \p Value and equal to \p Align * N + \p Skew |
| 498 | /// for some integer N. If \p Skew is larger than \p Align, its value is adjusted |
| 499 | /// to '\p Skew mod \p Align'. \p Align must be non-zero. |
| 500 | /// |
| 501 | /// Examples: |
| 502 | /// \code |
| 503 | /// alignTo(5, 8, 7) = 7 |
| 504 | /// alignTo(17, 8, 1) = 17 |
| 505 | /// alignTo(~0LL, 8, 3) = 3 |
| 506 | /// alignTo(321, 255, 42) = 552 |
| 507 | /// \endcode |
| 508 | inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew) { |
| 509 | assert(Align != 0u && "Align can't be 0."); |
| 510 | Skew %= Align; |
| 511 | return alignTo(Value - Skew, Align) + Skew; |
| 512 | } |
| 513 | |
| 514 | /// Returns the next integer (mod 2**64) that is greater than or equal to |
| 515 | /// \p Value and is a multiple of \c Align. \c Align must be non-zero. |
| 516 | template <uint64_t Align> constexpr inline uint64_t alignTo(uint64_t Value) { |
| 517 | static_assert(Align != 0u, "Align must be non-zero"); |
| 518 | return (Value + Align - 1) / Align * Align; |
| 519 | } |
| 520 | |
| 521 | /// Returns the integer ceil(Numerator / Denominator). |
| 522 | inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) { |
| 523 | return alignTo(Numerator, Denominator) / Denominator; |
| 524 | } |
| 525 | |
| 526 | /// Returns the integer nearest(Numerator / Denominator). |
| 527 | inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) { |
| 528 | return (Numerator + (Denominator / 2)) / Denominator; |
| 529 | } |
| 530 | |
| 531 | /// Returns the largest uint64_t less than or equal to \p Value and is |
| 532 | /// \p Skew mod \p Align. \p Align must be non-zero |
| 533 | inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { |
| 534 | assert(Align != 0u && "Align can't be 0."); |
| 535 | Skew %= Align; |
| 536 | return (Value - Skew) / Align * Align + Skew; |
| 537 | } |
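// Usage sketch (illustrative only; hypothetical example function):
inline void roundingExamples() {
  assert(divideCeil(7, 2) == 4 && divideCeil(8, 2) == 4);
  assert(divideNearest(7, 3) == 2 && divideNearest(8, 3) == 3);
  assert(alignDown(17, 8) == 16 && alignDown(17, 8, /*Skew=*/3) == 11);
}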
| 538 | |
| 539 | /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. |
| 540 | /// Requires 0 < B <= 32. |
| 541 | template <unsigned B> constexpr inline int32_t SignExtend32(uint32_t X) { |
| 542 | static_assert(B > 0, "Bit width can't be 0."); |
| 543 | static_assert(B <= 32, "Bit width out of range."); |
| 544 | return int32_t(X << (32 - B)) >> (32 - B); |
| 545 | } |
| 546 | |
| 547 | /// Sign-extend the number in the bottom B bits of X to a 32-bit integer. |
| 548 | /// Requires 0 < B <= 32. |
| 549 | inline int32_t SignExtend32(uint32_t X, unsigned B) { |
| 550 | assert(B > 0 && "Bit width can't be 0."); |
| 551 | assert(B <= 32 && "Bit width out of range."); |
| 552 | return int32_t(X << (32 - B)) >> (32 - B); |
| 553 | } |
| 554 | |
| 555 | /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. |
| 556 | /// Requires 0 < B <= 64. |
| 557 | template <unsigned B> constexpr inline int64_t SignExtend64(uint64_t x) { |
| 558 | static_assert(B > 0, "Bit width can't be 0."); |
| 559 | static_assert(B <= 64, "Bit width out of range."); |
| 560 | return int64_t(x << (64 - B)) >> (64 - B); |
| 561 | } |
| 562 | |
| 563 | /// Sign-extend the number in the bottom B bits of X to a 64-bit integer. |
| 564 | /// Requires 0 < B <= 64. |
| 565 | inline int64_t SignExtend64(uint64_t X, unsigned B) { |
| 566 | assert(B > 0 && "Bit width can't be 0."); |
| 567 | assert(B <= 64 && "Bit width out of range."); |
| 568 | return int64_t(X << (64 - B)) >> (64 - B); |
| 569 | } |
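// Usage sketch (illustrative only; hypothetical example function):
inline void signExtendExamples() {
  assert(SignExtend32<8>(0x7F) == 127 && SignExtend32<8>(0x80) == -128);
  assert(SignExtend64(0xFFFF, 16) == -1 && SignExtend64(0x7FFF, 16) == 32767);
}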
| 570 | |
| 571 | /// Subtract two unsigned integers, X and Y, of type T and return the absolute |
| 572 | /// value of the result. |
| 573 | template <typename T> |
| 574 | std::enable_if_t<std::is_unsigned_v<T>, T> AbsoluteDifference(T X, T Y) { |
| 575 | return X > Y ? (X - Y) : (Y - X); |
| 576 | } |
| 577 | |
| 578 | /// Add two unsigned integers, X and Y, of type T. Clamp the result to the |
| 579 | /// maximum representable value of T on overflow. ResultOverflowed indicates if |
| 580 | /// the result is larger than the maximum representable value of type T. |
| 581 | template <typename T> |
| 582 | std::enable_if_t<std::is_unsigned_v<T>, T> |
| 583 | SaturatingAdd(T X, T Y, bool *ResultOverflowed = nullptr) { |
| 584 | bool Dummy; |
| 585 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; |
| 586 | // Hacker's Delight, p. 29 |
| 587 | T Z = X + Y; |
| 588 | Overflowed = (Z < X || Z < Y); |
| 589 | if (Overflowed) |
| 590 | return std::numeric_limits<T>::max(); |
| 591 | else |
| 592 | return Z; |
| 593 | } |
| 594 | |
| 595 | /// Add multiple unsigned integers of type T. Clamp the result to the |
| 596 | /// maximum representable value of T on overflow. |
| 597 | template <class T, class... Ts> |
| 598 | std::enable_if_t<std::is_unsigned_v<T>, T> SaturatingAdd(T X, T Y, T Z, |
| 599 | Ts... Args) { |
| 600 | bool Overflowed = false; |
| 601 | T XY = SaturatingAdd(X, Y, &Overflowed); |
| 602 | if (Overflowed) |
| 603 | return SaturatingAdd(std::numeric_limits<T>::max(), T(1), Args...); |
| 604 | return SaturatingAdd(XY, Z, Args...); |
| 605 | } |
| 606 | |
| 607 | /// Multiply two unsigned integers, X and Y, of type T. Clamp the result to the |
| 608 | /// maximum representable value of T on overflow. ResultOverflowed indicates if |
| 609 | /// the result is larger than the maximum representable value of type T. |
| 610 | template <typename T> |
| 611 | std::enable_if_t<std::is_unsigned_v<T>, T> |
| 612 | SaturatingMultiply(T X, T Y, bool *ResultOverflowed = nullptr) { |
| 613 | bool Dummy; |
| 614 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; |
| 615 | |
| 616 | // Hacker's Delight, p. 30 has a different algorithm, but we don't use that |
| 617 | // because it fails for uint16_t (where multiplication can have undefined |
| 618 | // behavior due to promotion to int), and requires a division in addition |
| 619 | // to the multiplication. |
| 620 | |
| 621 | Overflowed = false; |
| 622 | |
| 623 | // Log2(Z) would be either Log2Z or Log2Z + 1. |
| 624 | // Special case: if X or Y is 0, Log2_64 gives -1, and Log2Z |
| 625 | // will necessarily be less than Log2Max as desired. |
| 626 | int Log2Z = Log2_64(X) + Log2_64(Y); |
| 627 | const T Max = std::numeric_limits<T>::max(); |
| 628 | int Log2Max = Log2_64(Max); |
| 629 | if (Log2Z < Log2Max) { |
| 630 | return X * Y; |
| 631 | } |
| 632 | if (Log2Z > Log2Max) { |
| 633 | Overflowed = true; |
| 634 | return Max; |
| 635 | } |
| 636 | |
| 637 | // We're going to use the top bit, and maybe overflow one |
| 638 | // bit past it. Multiply all but the bottom bit then add |
| 639 | // that on at the end. |
| 640 | T Z = (X >> 1) * Y; |
| 641 | if (Z & ~(Max >> 1)) { |
| 642 | Overflowed = true; |
| 643 | return Max; |
| 644 | } |
| 645 | Z <<= 1; |
| 646 | if (X & 1) |
| 647 | return SaturatingAdd(Z, Y, ResultOverflowed); |
| 648 | |
| 649 | return Z; |
| 650 | } |
| 651 | |
| 652 | /// Multiply two unsigned integers, X and Y, and add the unsigned integer, A to |
| 653 | /// the product. Clamp the result to the maximum representable value of T on |
| 654 | /// overflow. ResultOverflowed indicates if the result is larger than the |
| 655 | /// maximum representable value of type T. |
| 656 | template <typename T> |
| 657 | std::enable_if_t<std::is_unsigned_v<T>, T> |
| 658 | SaturatingMultiplyAdd(T X, T Y, T A, bool *ResultOverflowed = nullptr) { |
| 659 | bool Dummy; |
| 660 | bool &Overflowed = ResultOverflowed ? *ResultOverflowed : Dummy; |
| 661 | |
| 662 | T Product = SaturatingMultiply(X, Y, &Overflowed); |
| 663 | if (Overflowed) |
| 664 | return Product; |
| 665 | |
| 666 | return SaturatingAdd(A, Product, &Overflowed); |
| 667 | } |
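// Usage sketch (illustrative only; hypothetical example function):
inline void saturatingExamples() {
  bool Ovf = false;
  assert(SaturatingAdd<uint8_t>(200, 100, &Ovf) == 255 && Ovf);  // clamped to max
  assert(SaturatingMultiply<uint8_t>(20, 10, &Ovf) == 200 && !Ovf);
  assert(SaturatingMultiply<uint8_t>(16, 16, &Ovf) == 255 && Ovf);
  assert(SaturatingMultiplyAdd<uint8_t>(10, 10, 55, &Ovf) == 155 && !Ovf);
}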
| 668 | |
| 669 | /// Use this rather than HUGE_VALF; the latter causes warnings on MSVC. |
| 670 | extern const float huge_valf; |
| 671 | |
| 672 | |
| 673 | /// Add two signed integers, computing the two's complement truncated result, |
| 674 | /// returning true if overflow occurred. |
| 675 | template <typename T> |
| 676 | std::enable_if_t<std::is_signed_v<T>, T> AddOverflow(T X, T Y, T &Result) { |
| 677 | #if __has_builtin(__builtin_add_overflow) |
| 678 | return __builtin_add_overflow(X, Y, &Result); |
| 679 | #else |
| 680 | // Perform the unsigned addition. |
| 681 | using U = std::make_unsigned_t<T>; |
| 682 | const U UX = static_cast<U>(X); |
| 683 | const U UY = static_cast<U>(Y); |
| 684 | const U UResult = UX + UY; |
| 685 | |
| 686 | // Convert to signed. |
| 687 | Result = static_cast<T>(UResult); |
| 688 | |
| 689 | // Adding two positive numbers should result in a positive number. |
| 690 | if (X > 0 && Y > 0) |
| 691 | return Result <= 0; |
| 692 | // Adding two negatives should result in a negative number. |
| 693 | if (X < 0 && Y < 0) |
| 694 | return Result >= 0; |
| 695 | return false; |
| 696 | #endif |
| 697 | } |
| 698 | |
| 699 | /// Subtract two signed integers, computing the two's complement truncated |
| 700 | /// result, returning true if an overflow occurred. |
| 701 | template <typename T> |
| 702 | std::enable_if_t<std::is_signed_v<T>, T> SubOverflow(T X, T Y, T &Result) { |
| 703 | #if __has_builtin(__builtin_sub_overflow) |
| 704 | return __builtin_sub_overflow(X, Y, &Result); |
| 705 | #else |
| 706 | // Perform the unsigned addition. |
| 707 | using U = std::make_unsigned_t<T>; |
| 708 | const U UX = static_cast<U>(X); |
| 709 | const U UY = static_cast<U>(Y); |
| 710 | const U UResult = UX - UY; |
| 711 | |
| 712 | // Convert to signed. |
| 713 | Result = static_cast<T>(UResult); |
| 714 | |
| 715 | // Subtracting a positive number from a negative results in a negative number. |
| 716 | if (X <= 0 && Y > 0) |
| 717 | return Result >= 0; |
| 718 | // Subtracting a negative number from a positive results in a positive number. |
| 719 | if (X >= 0 && Y < 0) |
| 720 | return Result <= 0; |
| 721 | return false; |
| 722 | #endif |
| 723 | } |
| 724 | |
| 725 | /// Multiply two signed integers, computing the two's complement truncated |
| 726 | /// result, returning true if an overflow occurred. |
| 727 | template <typename T> |
| 728 | std::enable_if_t<std::is_signed_v<T>, T> MulOverflow(T X, T Y, T &Result) { |
| 729 | // Perform the unsigned multiplication on absolute values. |
| 730 | using U = std::make_unsigned_t<T>; |
| 731 | const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X); |
| 732 | const U UY = Y < 0 ? (0 - static_cast<U>(Y)) : static_cast<U>(Y); |
| 733 | const U UResult = UX * UY; |
| 734 | |
| 735 | // Convert to signed. |
| 736 | const bool IsNegative = (X < 0) ^ (Y < 0); |
| 737 | Result = IsNegative ? (0 - UResult) : UResult; |
| 738 | |
| 739 | // If any of the args was 0, result is 0 and no overflow occurs. |
| 740 | if (UX == 0 || UY == 0) |
| 741 | return false; |
| 742 | |
| 743 | // UX and UY are in [1, 2^n], where n is the number of digits. |
| 744 | // Check how the max allowed absolute value (2^n for negative, 2^(n-1) for |
| 745 | // positive) divided by an argument compares to the other. |
| 746 | if (IsNegative) |
| 747 | return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY; |
| 748 | else |
| 749 | return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY; |
| 750 | } |
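// Usage sketch (illustrative only; hypothetical example function): the result
// is always the truncated two's complement value, and the return value
// reports whether it wrapped.
inline void overflowExamples() {
  int8_t R;
  bool Wrapped = AddOverflow<int8_t>(100, 100, R);
  assert(Wrapped && R == -56);                      // 200 wraps to -56
  assert(!SubOverflow<int8_t>(-100, 27, R) && R == -127);
  assert(MulOverflow<int8_t>(16, 16, R) && R == 0); // 256 wraps to 0
}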
| 751 | |
| 752 | } // End llvm namespace |
| 753 | |
| 754 | #endif |