| File: | build/source/llvm/lib/Target/X86/X86TargetTransformInfo.cpp |
| Warning: | line 4329, column 15: Division by zero |
| 1 | //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// | |||
| 2 | // | |||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
| 4 | // See https://llvm.org/LICENSE.txt for license information. | |||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
| 6 | // | |||
| 7 | //===----------------------------------------------------------------------===// | |||
| 8 | /// \file | |||
| 9 | /// This file implements a TargetTransformInfo analysis pass specific to the | |||
| 10 | /// X86 target machine. It uses the target's detailed information to provide | |||
| 11 | /// more precise answers to certain TTI queries, while letting the target | |||
| 12 | /// independent and default TTI implementations handle the rest. | |||
| 13 | /// | |||
| 14 | //===----------------------------------------------------------------------===// | |||
| 15 | /// A note about the cost-model numbers used below: the numbers correspond | |||
| 16 | /// to some "generic" X86 CPU rather than to a specific CPU model. Usually | |||
| 17 | /// the numbers correspond to the CPU where the | |||
| 18 | /// feature first appeared. For example, if we do Subtarget.hasSSE42() in | |||
| 19 | /// the lookups below the cost is based on Nehalem as that was the first CPU | |||
| 20 | /// to support that feature level and thus has most likely the worst case cost, | |||
| 21 | /// although we may discard an outlying worst cost from one CPU (e.g. Atom). | |||
| 22 | /// | |||
| 23 | /// Some examples of other technologies/CPUs: | |||
| 24 | /// SSE 3 - Pentium4 / Athlon64 | |||
| 25 | /// SSE 4.1 - Penryn | |||
| 26 | /// SSE 4.2 - Nehalem / Silvermont | |||
| 27 | /// AVX - Sandy Bridge / Jaguar / Bulldozer | |||
| 28 | /// AVX2 - Haswell / Ryzen | |||
| 29 | /// AVX-512 - Xeon Phi / Skylake | |||
| 30 | /// | |||
| 31 | /// And some examples of target-dependent instruction costs (latency): | |||
| 32 | /// divss sqrtss rsqrtss | |||
| 33 | /// AMD K7 11-16 19 3 | |||
| 34 | /// Piledriver 9-24 13-15 5 | |||
| 35 | /// Jaguar 14 16 2 | |||
| 36 | /// Pentium II,III 18 30 2 | |||
| 37 | /// Nehalem 7-14 7-18 3 | |||
| 38 | /// Haswell 10-13 11 5 | |||
| 39 | /// | |||
| 40 | /// Interpreting the 4 TargetCostKind types: | |||
| 41 | /// TCK_RecipThroughput and TCK_Latency should try to match the worst case | |||
| 42 | /// values reported by the CPU scheduler models (and llvm-mca). | |||
| 43 | /// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the | |||
| 44 | /// actual encoding size of the instruction. | |||
| 45 | /// TCK_SizeAndLatency should match the worst case micro-op counts reported | |||
| 46 | /// by the CPU scheduler models (and llvm-mca), to ensure that they are | |||
| 47 | /// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values, which are | |||
| 48 | /// often used as the cost thresholds where TCK_SizeAndLatency is requested. | |||
| 49 | //===----------------------------------------------------------------------===// | |||
| 50 | ||||
| 51 | #include "X86TargetTransformInfo.h" | |||
| 52 | #include "llvm/Analysis/TargetTransformInfo.h" | |||
| 53 | #include "llvm/CodeGen/BasicTTIImpl.h" | |||
| 54 | #include "llvm/CodeGen/CostTable.h" | |||
| 55 | #include "llvm/CodeGen/TargetLowering.h" | |||
| 56 | #include "llvm/IR/InstIterator.h" | |||
| 57 | #include "llvm/IR/IntrinsicInst.h" | |||
| 58 | #include "llvm/Support/Debug.h" | |||
| 59 | #include <optional> | |||
| 60 | ||||
| 61 | using namespace llvm; | |||
| 62 | ||||
| 63 | #define DEBUG_TYPE"x86tti" "x86tti" | |||
| 64 | ||||
| 65 | //===----------------------------------------------------------------------===// | |||
| 66 | // | |||
| 67 | // X86 cost model. | |||
| 68 | // | |||
| 69 | //===----------------------------------------------------------------------===// | |||
| 70 | ||||
| 71 | // Helper struct to store/access costs for each cost kind. | |||
| 72 | // TODO: Move this to allow other targets to use it? | |||
| 73 | struct CostKindCosts { | |||
| 74 | unsigned RecipThroughputCost = ~0U; | |||
| 75 | unsigned LatencyCost = ~0U; | |||
| 76 | unsigned CodeSizeCost = ~0U; | |||
| 77 | unsigned SizeAndLatencyCost = ~0U; | |||
| 78 | ||||
| 79 | std::optional<unsigned> | |||
| 80 | operator[](TargetTransformInfo::TargetCostKind Kind) const { | |||
| 81 | unsigned Cost = ~0U; | |||
| 82 | switch (Kind) { | |||
| 83 | case TargetTransformInfo::TCK_RecipThroughput: | |||
| 84 | Cost = RecipThroughputCost; | |||
| 85 | break; | |||
| 86 | case TargetTransformInfo::TCK_Latency: | |||
| 87 | Cost = LatencyCost; | |||
| 88 | break; | |||
| 89 | case TargetTransformInfo::TCK_CodeSize: | |||
| 90 | Cost = CodeSizeCost; | |||
| 91 | break; | |||
| 92 | case TargetTransformInfo::TCK_SizeAndLatency: | |||
| 93 | Cost = SizeAndLatencyCost; | |||
| 94 | break; | |||
| 95 | } | |||
| 96 | if (Cost == ~0U) | |||
| 97 | return std::nullopt; | |||
| 98 | return Cost; | |||
| 99 | } | |||
| 100 | }; | |||
| 101 | using CostKindTblEntry = CostTblEntryT<CostKindCosts>; | |||
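For readers unfamiliar with the pattern, here is a minimal standalone sketch (editor's illustration, not part of the LLVM source) of how CostKindCosts is meant to be used; the TargetCostKind enum and the struct are stubbed with hypothetical local names so the example compiles outside of LLVM:

#include <cstdio>
#include <optional>

// Stand-in for TargetTransformInfo::TargetCostKind (hypothetical stub).
enum TargetCostKind { TCK_RecipThroughput, TCK_Latency, TCK_CodeSize, TCK_SizeAndLatency };

struct ExampleCosts {
  unsigned RecipThroughputCost = ~0U;
  unsigned LatencyCost = ~0U;
  unsigned CodeSizeCost = ~0U;
  unsigned SizeAndLatencyCost = ~0U;

  // Mirrors CostKindCosts::operator[]: unset kinds report "no entry".
  std::optional<unsigned> operator[](TargetCostKind Kind) const {
    unsigned Cost = ~0U;
    switch (Kind) {
    case TCK_RecipThroughput: Cost = RecipThroughputCost; break;
    case TCK_Latency:         Cost = LatencyCost;         break;
    case TCK_CodeSize:        Cost = CodeSizeCost;        break;
    case TCK_SizeAndLatency:  Cost = SizeAndLatencyCost;  break;
    }
    if (Cost == ~0U)
      return std::nullopt;
    return Cost;
  }
};

int main() {
  // Like a table entry { ISD::SDIV, MVT::v16i32, { 6 } }: only the
  // reciprocal-throughput cost is specified.
  ExampleCosts DivCost{6};
  if (auto C = DivCost[TCK_RecipThroughput])
    std::printf("throughput cost = %u\n", *C); // prints 6
  if (!DivCost[TCK_Latency])
    std::printf("latency cost unset; caller falls back to defaults\n");
}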
| 102 | ||||
| 103 | TargetTransformInfo::PopcntSupportKind | |||
| 104 | X86TTIImpl::getPopcntSupport(unsigned TyWidth) { | |||
| 105 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); | |||
| 106 | // TODO: Currently the __builtin_popcount() implementation using SSE3 | |||
| 107 | // instructions is inefficient. Once the problem is fixed, we should | |||
| 108 | // call ST->hasSSE3() instead of ST->hasPOPCNT(). | |||
| 109 | return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software; | |||
| 110 | } | |||
| 111 | ||||
| 112 | std::optional<unsigned> X86TTIImpl::getCacheSize( | |||
| 113 | TargetTransformInfo::CacheLevel Level) const { | |||
| 114 | switch (Level) { | |||
| 115 | case TargetTransformInfo::CacheLevel::L1D: | |||
| 116 | // - Penryn | |||
| 117 | // - Nehalem | |||
| 118 | // - Westmere | |||
| 119 | // - Sandy Bridge | |||
| 120 | // - Ivy Bridge | |||
| 121 | // - Haswell | |||
| 122 | // - Broadwell | |||
| 123 | // - Skylake | |||
| 124 | // - Kabylake | |||
| 125 | return 32 * 1024; // 32 KByte | |||
| 126 | case TargetTransformInfo::CacheLevel::L2D: | |||
| 127 | // - Penryn | |||
| 128 | // - Nehalem | |||
| 129 | // - Westmere | |||
| 130 | // - Sandy Bridge | |||
| 131 | // - Ivy Bridge | |||
| 132 | // - Haswell | |||
| 133 | // - Broadwell | |||
| 134 | // - Skylake | |||
| 135 | // - Kabylake | |||
| 136 | return 256 * 1024; // 256 KByte | |||
| 137 | } | |||
| 138 | ||||
| 139 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel")::llvm::llvm_unreachable_internal("Unknown TargetTransformInfo::CacheLevel" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 139); | |||
| 140 | } | |||
| 141 | ||||
| 142 | std::optional<unsigned> X86TTIImpl::getCacheAssociativity( | |||
| 143 | TargetTransformInfo::CacheLevel Level) const { | |||
| 144 | // - Penryn | |||
| 145 | // - Nehalem | |||
| 146 | // - Westmere | |||
| 147 | // - Sandy Bridge | |||
| 148 | // - Ivy Bridge | |||
| 149 | // - Haswell | |||
| 150 | // - Broadwell | |||
| 151 | // - Skylake | |||
| 152 | // - Kabylake | |||
| 153 | switch (Level) { | |||
| 154 | case TargetTransformInfo::CacheLevel::L1D: | |||
| 155 | [[fallthrough]]; | |||
| 156 | case TargetTransformInfo::CacheLevel::L2D: | |||
| 157 | return 8; | |||
| 158 | } | |||
| 159 | ||||
| 160 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel")::llvm::llvm_unreachable_internal("Unknown TargetTransformInfo::CacheLevel" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 160); | |||
| 161 | } | |||
| 162 | ||||
| 163 | unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { | |||
| 164 | bool Vector = (ClassID == 1); | |||
| 165 | if (Vector && !ST->hasSSE1()) | |||
| 166 | return 0; | |||
| 167 | ||||
| 168 | if (ST->is64Bit()) { | |||
| 169 | if (Vector && ST->hasAVX512()) | |||
| 170 | return 32; | |||
| 171 | return 16; | |||
| 172 | } | |||
| 173 | return 8; | |||
| 174 | } | |||
| 175 | ||||
| 176 | TypeSize | |||
| 177 | X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { | |||
| 178 | unsigned PreferVectorWidth = ST->getPreferVectorWidth(); | |||
| 179 | switch (K) { | |||
| 180 | case TargetTransformInfo::RGK_Scalar: | |||
| 181 | return TypeSize::getFixed(ST->is64Bit() ? 64 : 32); | |||
| 182 | case TargetTransformInfo::RGK_FixedWidthVector: | |||
| 183 | if (ST->hasAVX512() && PreferVectorWidth >= 512) | |||
| 184 | return TypeSize::getFixed(512); | |||
| 185 | if (ST->hasAVX() && PreferVectorWidth >= 256) | |||
| 186 | return TypeSize::getFixed(256); | |||
| 187 | if (ST->hasSSE1() && PreferVectorWidth >= 128) | |||
| 188 | return TypeSize::getFixed(128); | |||
| 189 | return TypeSize::getFixed(0); | |||
| 190 | case TargetTransformInfo::RGK_ScalableVector: | |||
| 191 | return TypeSize::getScalable(0); | |||
| 192 | } | |||
| 193 | ||||
| 194 | llvm_unreachable("Unsupported register kind")::llvm::llvm_unreachable_internal("Unsupported register kind" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 194); | |||
| 195 | } | |||
| 196 | ||||
| 197 | unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const { | |||
| 198 | return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) | |||
| 199 | .getFixedSize(); | |||
| 200 | } | |||
| 201 | ||||
| 202 | unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { | |||
| 203 | // If the loop will not be vectorized, don't interleave the loop. | |||
| 204 | // Let the regular unroller unroll the loop instead, which saves the | |||
| 205 | // overflow check and memory check cost. | |||
| 206 | if (VF == 1) | |||
| 207 | return 1; | |||
| 208 | ||||
| 209 | if (ST->isAtom()) | |||
| 210 | return 1; | |||
| 211 | ||||
| 212 | // Sandybridge and Haswell have multiple execution ports and pipelined | |||
| 213 | // vector units. | |||
| 214 | if (ST->hasAVX()) | |||
| 215 | return 4; | |||
| 216 | ||||
| 217 | return 2; | |||
| 218 | } | |||
| 219 | ||||
| 220 | InstructionCost X86TTIImpl::getArithmeticInstrCost( | |||
| 221 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, | |||
| 222 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, | |||
| 223 | ArrayRef<const Value *> Args, | |||
| 224 | const Instruction *CxtI) { | |||
| 225 | ||||
| 226 | // vXi8 multiplications are always promoted to vXi16. | |||
| 227 | if (Opcode == Instruction::Mul && Ty->isVectorTy() && | |||
| 228 | Ty->getScalarSizeInBits() == 8) { | |||
| 229 | Type *WideVecTy = | |||
| 230 | VectorType::getExtendedElementVectorType(cast<VectorType>(Ty)); | |||
| 231 | return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty, | |||
| 232 | TargetTransformInfo::CastContextHint::None, | |||
| 233 | CostKind) + | |||
| 234 | getCastInstrCost(Instruction::Trunc, Ty, WideVecTy, | |||
| 235 | TargetTransformInfo::CastContextHint::None, | |||
| 236 | CostKind) + | |||
| 237 | getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info); | |||
| 238 | } | |||
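A quick scalar sanity check of the promotion above (an editor's sketch, not LLVM code): widening to i16, multiplying, and truncating preserves the low 8 bits, which is why costing vXi8 MUL as zext + vXi16 MUL + trunc is sound.

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B) {
      uint8_t Narrow  = uint8_t(uint8_t(A) * uint8_t(B));    // i8 multiply
      uint8_t Widened = uint8_t(uint16_t(A) * uint16_t(B));  // zext, i16 mul, trunc
      assert(Narrow == Widened);
    }
}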
| 239 | ||||
| 240 | // Legalize the type. | |||
| 241 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); | |||
| 242 | ||||
| 243 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
| 244 | assert(ISD && "Invalid opcode")(static_cast <bool> (ISD && "Invalid opcode") ? void (0) : __assert_fail ("ISD && \"Invalid opcode\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 244, __extension__ __PRETTY_FUNCTION__)); | |||
| 245 | ||||
| 246 | if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() && | |||
| 247 | LT.second.getScalarType() == MVT::i32) { | |||
| 248 | // Check if the operands can be represented as a smaller datatype. | |||
| 249 | bool Op1Signed = false, Op2Signed = false; | |||
| 250 | unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed); | |||
| 251 | unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed); | |||
| 252 | unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize); | |||
| 253 | bool SignedMode = Op1Signed || Op2Signed; | |||
| 254 | ||||
| 255 | // If both are representable as i15 and at least one is constant, | |||
| 256 | // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we | |||
| 257 | // can treat this as PMADDWD which has the same costs as a vXi16 multiply. | |||
| 258 | if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) { | |||
| 259 | bool Op1Constant = | |||
| 260 | isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]); | |||
| 261 | bool Op2Constant = | |||
| 262 | isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]); | |||
| 263 | bool Op1Sext = isa<SExtInst>(Args[0]) && | |||
| 264 | (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41())); | |||
| 265 | bool Op2Sext = isa<SExtInst>(Args[1]) && | |||
| 266 | (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41())); | |||
| 267 | ||||
| 268 | bool IsZeroExtended = !Op1Signed || !Op2Signed; | |||
| 269 | bool IsConstant = Op1Constant || Op2Constant; | |||
| 270 | bool IsSext = Op1Sext || Op2Sext; | |||
| 271 | if (IsConstant || IsZeroExtended || IsSext) | |||
| 272 | LT.second = | |||
| 273 | MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements()); | |||
| 274 | } | |||
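Why the bound above is i15 rather than i16 (a standalone sketch of the invariant, not the vector lowering itself): PMADDWD multiplies *signed* 16-bit lanes, and a value known to fit in 15 bits remains non-negative when reinterpreted as int16_t, so the signed 16x16->32 product equals the true product.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A = 0; A < (1u << 15); A += 97)    // strided scan of the i15 range
    for (uint32_t B = 0; B < (1u << 15); B += 101) {
      int32_t Pmaddwd = int32_t(int16_t(A)) * int32_t(int16_t(B));
      assert(Pmaddwd == int32_t(A * B));
    }
}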
| 275 | ||||
| 276 | // Check if the vXi32 operands can be shrunk into a smaller datatype. | |||
| 277 | // This should match the codegen from reduceVMULWidth. | |||
| 278 | // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()). | |||
| 279 | if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) { | |||
| 280 | if (OpMinSize <= 7) | |||
| 281 | return LT.first * 3; // pmullw/sext | |||
| 282 | if (!SignedMode && OpMinSize <= 8) | |||
| 283 | return LT.first * 3; // pmullw/zext | |||
| 284 | if (OpMinSize <= 15) | |||
| 285 | return LT.first * 5; // pmullw/pmulhw/pshuf | |||
| 286 | if (!SignedMode && OpMinSize <= 16) | |||
| 287 | return LT.first * 5; // pmullw/pmulhw/pshuf | |||
| 288 | } | |||
| 289 | } | |||
| 290 | ||||
| 291 | // Vector multiply by pow2 will be simplified to shifts. | |||
| 292 | // Vector multiply by -pow2 will be simplified to shifts/negates. | |||
| 293 | if (ISD == ISD::MUL && Op2Info.isConstant() && | |||
| 294 | (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) { | |||
| 295 | InstructionCost Cost = | |||
| 296 | getArithmeticInstrCost(Instruction::Shl, Ty, CostKind, | |||
| 297 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
| 298 | if (Op2Info.isNegatedPowerOf2()) | |||
| 299 | Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind); | |||
| 300 | return Cost; | |||
| 301 | } | |||
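The identities behind the costing above, written out as a standalone scalar sketch: X * 2^K == X << K, and X * -(2^K) == 0 - (X << K), hence shift (plus a subtract for the negated case).

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X = -1000; X <= 1000; ++X)
    for (unsigned K = 0; K < 8; ++K) {
      int32_t P = int32_t(1) << K;
      assert(X *  P == int32_t(uint32_t(X) << K));         // shl
      assert(X * -P == int32_t(0 - (uint32_t(X) << K)));   // shl + sub
    }
}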
| 302 | ||||
| 303 | // On X86, vector signed division by a constant power-of-two is | |||
| 304 | // normally expanded to the sequence SRA + SRL + ADD + SRA. | |||
| 305 | // The OperandValue properties may not be the same as those of the | |||
| 306 | // previous operation; conservatively assume OP_None. | |||
| 307 | if ((ISD == ISD::SDIV || ISD == ISD::SREM) && | |||
| 308 | Op2Info.isConstant() && Op2Info.isPowerOf2()) { | |||
| 309 | InstructionCost Cost = | |||
| 310 | 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, | |||
| 311 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
| 312 | Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, | |||
| 313 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
| 314 | Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, | |||
| 315 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
| 316 | ||||
| 317 | if (ISD == ISD::SREM) { | |||
| 318 | // For SREM: (X % C) is the equivalent of (X - (X/C)*C) | |||
| 319 | Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), | |||
| 320 | Op2Info.getNoProps()); | |||
| 321 | Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(), | |||
| 322 | Op2Info.getNoProps()); | |||
| 323 | } | |||
| 324 | ||||
| 325 | return Cost; | |||
| 326 | } | |||
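The SRA + SRL + ADD + SRA sequence costed above, written out in scalar form for C = 2^K (an editor's sketch with a hypothetical helper name, assuming the usual arithmetic right shift for signed values): the first two shifts build a rounding bias of C - 1 for negative X and 0 otherwise, so the final arithmetic shift truncates toward zero, matching sdiv semantics; srem then follows from X - (X/C)*C.

#include <cassert>
#include <cstdint>

int32_t SDivPow2(int32_t X, unsigned K) {
  int32_t Sign = X >> 31;                      // SRA: all-ones if X < 0
  uint32_t Bias = uint32_t(Sign) >> (32 - K);  // SRL: (2^K - 1) if X < 0
  int32_t Adjusted = X + int32_t(Bias);        // ADD
  return Adjusted >> K;                        // SRA
}

int main() {
  for (int32_t X = -4096; X <= 4096; ++X)
    for (unsigned K = 1; K < 16; ++K) {
      int32_t C = int32_t(1) << K;
      assert(SDivPow2(X, K) == X / C);
      assert(X - SDivPow2(X, K) * C == X % C);  // SREM = X - (X/C)*C
    }
}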
| 327 | ||||
| 328 | // Vector unsigned division/remainder will be simplified to shifts/masks. | |||
| 329 | if ((ISD == ISD::UDIV || ISD == ISD::UREM) && | |||
| 330 | Op2Info.isConstant() && Op2Info.isPowerOf2()) { | |||
| 331 | if (ISD == ISD::UDIV) | |||
| 332 | return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, | |||
| 333 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
| 334 | // UREM | |||
| 335 | return getArithmeticInstrCost(Instruction::And, Ty, CostKind, | |||
| 336 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
| 337 | } | |||
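The unsigned counterparts costed above reduce to single instructions, as this standalone sketch checks: X / 2^K == X >> K and X % 2^K == X & (2^K - 1).

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X < 100000; X += 7)
    for (unsigned K = 1; K < 31; ++K) {
      uint32_t C = uint32_t(1) << K;
      assert(X / C == X >> K);         // UDIV -> LShr
      assert(X % C == (X & (C - 1)));  // UREM -> And
    }
}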
| 338 | ||||
| 339 | static const CostKindTblEntry AVX512BWUniformConstCostTable[] = { | |||
| 340 | { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand. | |||
| 341 | { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand. | |||
| 342 | { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb. | |||
| 343 | { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand. | |||
| 344 | { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand. | |||
| 345 | { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb. | |||
| 346 | { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand. | |||
| 347 | { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand. | |||
| 348 | { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb. | |||
| 349 | ||||
| 350 | { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw | |||
| 351 | { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw | |||
| 352 | { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw | |||
| 353 | { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw | |||
| 354 | { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw | |||
| 355 | { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw | |||
| 356 | }; | |||
| 357 | ||||
| 358 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI()) | |||
| 359 | if (const auto *Entry = | |||
| 360 | CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second)) | |||
| 361 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 362 | return LT.first * *KindCost; | |||
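Every cost table in this function is consulted with the same three-step pattern seen here: gate on a subtarget feature, find the (ISD opcode, legalized MVT) row, and scale the per-kind cost by LT.first, the number of registers the legalized type splits into. A standalone analogue with stubbed enum and entry names (hypothetical, for illustration only):

#include <cstdio>
#include <optional>

enum Op { SHL, SRL };
enum VT { v16i8, v8i16 };

struct Entry { Op O; VT T; unsigned ThroughputCost; };

static const Entry Table[] = {
  { SHL, v16i8, 1 },
  { SRL, v16i8, 1 },
};

std::optional<unsigned> lookup(Op O, VT T) {
  for (const Entry &E : Table)
    if (E.O == O && E.T == T)
      return E.ThroughputCost;
  return std::nullopt;              // fall through to the next table
}

int main() {
  unsigned LegalizationFactor = 2;  // e.g. a v32i8 op split into two v16i8 ops
  if (auto Cost = lookup(SHL, v16i8))
    std::printf("cost = %u\n", LegalizationFactor * *Cost);  // prints 2
}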
| 363 | ||||
| 364 | static const CostKindTblEntry AVX512UniformConstCostTable[] = { | |||
| 365 | { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand. | |||
| 366 | { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand. | |||
| 367 | { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb. | |||
| 368 | ||||
| 369 | { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split. | |||
| 370 | { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split. | |||
| 371 | { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split. | |||
| 372 | ||||
| 373 | { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld | |||
| 374 | { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld | |||
| 375 | { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad | |||
| 376 | { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld | |||
| 377 | { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld | |||
| 378 | { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad | |||
| 379 | ||||
| 380 | { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq | |||
| 381 | { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq | |||
| 382 | { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq | |||
| 383 | { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq | |||
| 384 | { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq | |||
| 385 | { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq | |||
| 386 | { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq | |||
| 387 | ||||
| 388 | { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence | |||
| 389 | { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence | |||
| 390 | { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence | |||
| 391 | { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence | |||
| 392 | }; | |||
| 393 | ||||
| 394 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512()) | |||
| 395 | if (const auto *Entry = | |||
| 396 | CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second)) | |||
| 397 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 398 | return LT.first * *KindCost; | |||
| 399 | ||||
| 400 | static const CostKindTblEntry AVX2UniformConstCostTable[] = { | |||
| 401 | { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand. | |||
| 402 | { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand. | |||
| 403 | { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb. | |||
| 404 | { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand. | |||
| 405 | { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand. | |||
| 406 | { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb. | |||
| 407 | ||||
| 408 | { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw | |||
| 409 | { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw | |||
| 410 | { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw | |||
| 411 | { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw | |||
| 412 | { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw | |||
| 413 | { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw | |||
| 414 | ||||
| 415 | { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld | |||
| 416 | { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld | |||
| 417 | { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad | |||
| 418 | { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld | |||
| 419 | { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld | |||
| 420 | { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad | |||
| 421 | ||||
| 422 | { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq | |||
| 423 | { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq | |||
| 424 | { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle. | |||
| 425 | { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq | |||
| 426 | { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq | |||
| 427 | { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split. | |||
| 428 | ||||
| 429 | { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence | |||
| 430 | { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence | |||
| 431 | { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence | |||
| 432 | { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence | |||
| 433 | }; | |||
| 434 | ||||
| 435 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2()) | |||
| 436 | if (const auto *Entry = | |||
| 437 | CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second)) | |||
| 438 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 439 | return LT.first * *KindCost; | |||
| 440 | ||||
| 441 | static const CostKindTblEntry AVXUniformConstCostTable[] = { | |||
| 442 | { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand. | |||
| 443 | { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand. | |||
| 444 | { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb. | |||
| 445 | { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split. | |||
| 446 | { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split. | |||
| 447 | { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split. | |||
| 448 | ||||
| 449 | { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw. | |||
| 450 | { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw. | |||
| 451 | { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw. | |||
| 452 | { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split. | |||
| 453 | { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split. | |||
| 454 | { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split. | |||
| 455 | ||||
| 456 | { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld. | |||
| 457 | { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld. | |||
| 458 | { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad. | |||
| 459 | { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split. | |||
| 460 | { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split. | |||
| 461 | { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split. | |||
| 462 | ||||
| 463 | { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq. | |||
| 464 | { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq. | |||
| 465 | { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle. | |||
| 466 | { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split. | |||
| 467 | { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split. | |||
| 468 | { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split. | |||
| 469 | ||||
| 470 | { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split. | |||
| 471 | { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split. | |||
| 472 | { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split. | |||
| 473 | { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split. | |||
| 474 | }; | |||
| 475 | ||||
| 476 | // XOP has faster vXi8 shifts. | |||
| 477 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() && | |||
| 478 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) | |||
| 479 | if (const auto *Entry = | |||
| 480 | CostTableLookup(AVXUniformConstCostTable, ISD, LT.second)) | |||
| 481 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 482 | return LT.first * *KindCost; | |||
| 483 | ||||
| 484 | static const CostKindTblEntry SSE2UniformConstCostTable[] = { | |||
| 485 | { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand. | |||
| 486 | { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand. | |||
| 487 | { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb. | |||
| 488 | ||||
| 489 | { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw. | |||
| 490 | { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw. | |||
| 491 | { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw. | |||
| 492 | ||||
| 493 | { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld | |||
| 494 | { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld. | |||
| 495 | { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad. | |||
| 496 | ||||
| 497 | { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq. | |||
| 498 | { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq. | |||
| 499 | { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle. | |||
| 500 | ||||
| 501 | { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence | |||
| 502 | { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence | |||
| 503 | { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence | |||
| 504 | { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence | |||
| 505 | }; | |||
| 506 | ||||
| 507 | // XOP has faster vXi8 shifts. | |||
| 508 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() && | |||
| 509 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) | |||
| 510 | if (const auto *Entry = | |||
| 511 | CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) | |||
| 512 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 513 | return LT.first * *KindCost; | |||
| 514 | ||||
| 515 | static const CostKindTblEntry AVX512BWConstCostTable[] = { | |||
| 516 | { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence | |||
| 517 | { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | |||
| 518 | { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence | |||
| 519 | { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | |||
| 520 | ||||
| 521 | { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence | |||
| 522 | { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence | |||
| 523 | { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence | |||
| 524 | { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence | |||
| 525 | }; | |||
| 526 | ||||
| 527 | if (Op2Info.isConstant() && ST->hasBWI()) | |||
| 528 | if (const auto *Entry = | |||
| 529 | CostTableLookup(AVX512BWConstCostTable, ISD, LT.second)) | |||
| 530 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 531 | return LT.first * *KindCost; | |||
| 532 | ||||
| 533 | static const CostKindTblEntry AVX512ConstCostTable[] = { | |||
| 534 | { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence | |||
| 535 | { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence | |||
| 536 | { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence | |||
| 537 | { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence | |||
| 538 | ||||
| 539 | { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence | |||
| 540 | { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence | |||
| 541 | { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence | |||
| 542 | { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence | |||
| 543 | ||||
| 544 | { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence | |||
| 545 | { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence | |||
| 546 | { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence | |||
| 547 | { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence | |||
| 548 | }; | |||
| 549 | ||||
| 550 | if (Op2Info.isConstant() && ST->hasAVX512()) | |||
| 551 | if (const auto *Entry = | |||
| 552 | CostTableLookup(AVX512ConstCostTable, ISD, LT.second)) | |||
| 553 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 554 | return LT.first * *KindCost; | |||
| 555 | ||||
| 556 | static const CostKindTblEntry AVX2ConstCostTable[] = { | |||
| 557 | { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence | |||
| 558 | { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | |||
| 559 | { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence | |||
| 560 | { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | |||
| 561 | ||||
| 562 | { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence | |||
| 563 | { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence | |||
| 564 | { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence | |||
| 565 | { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence | |||
| 566 | ||||
| 567 | { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence | |||
| 568 | { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence | |||
| 569 | { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence | |||
| 570 | { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence | |||
| 571 | }; | |||
| 572 | ||||
| 573 | if (Op2Info.isConstant() && ST->hasAVX2()) | |||
| 574 | if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second)) | |||
| 575 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 576 | return LT.first * *KindCost; | |||
| 577 | ||||
| 578 | static const CostKindTblEntry AVXConstCostTable[] = { | |||
| 579 | { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split. | |||
| 580 | { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. | |||
| 581 | { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split. | |||
| 582 | { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. | |||
| 583 | ||||
| 584 | { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split. | |||
| 585 | { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split. | |||
| 586 | { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split. | |||
| 587 | { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split. | |||
| 588 | ||||
| 589 | { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence | |||
| 590 | { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence | |||
| 591 | { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split. | |||
| 592 | { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split. | |||
| 593 | }; | |||
| 594 | ||||
| 595 | if (Op2Info.isConstant() && ST->hasAVX()) | |||
| 596 | if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second)) | |||
| 597 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 598 | return LT.first * *KindCost; | |||
| 599 | ||||
| 600 | static const CostKindTblEntry SSE41ConstCostTable[] = { | |||
| 601 | { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence | |||
| 602 | { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence | |||
| 603 | }; | |||
| 604 | ||||
| 605 | if (Op2Info.isConstant() && ST->hasSSE41()) | |||
| 606 | if (const auto *Entry = | |||
| 607 | CostTableLookup(SSE41ConstCostTable, ISD, LT.second)) | |||
| 608 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 609 | return LT.first * *KindCost; | |||
| 610 | ||||
| 611 | static const CostKindTblEntry SSE2ConstCostTable[] = { | |||
| 612 | { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence | |||
| 613 | { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | |||
| 614 | { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence | |||
| 615 | { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence | |||
| 616 | ||||
| 617 | { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence | |||
| 618 | { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence | |||
| 619 | { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence | |||
| 620 | { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence | |||
| 621 | ||||
| 622 | { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence | |||
| 623 | { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence | |||
| 624 | { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence | |||
| 625 | { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence | |||
| 626 | }; | |||
| 627 | ||||
| 628 | if (Op2Info.isConstant() && ST->hasSSE2()) | |||
| 629 | if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second)) | |||
| 630 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 631 | return LT.first * *KindCost; | |||
| 632 | ||||
| 633 | static const CostKindTblEntry AVX512BWUniformCostTable[] = { | |||
| 634 | { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand. | |||
| 635 | { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand. | |||
| 636 | { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb. | |||
| 637 | { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand. | |||
| 638 | { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand. | |||
| 639 | { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb. | |||
| 640 | { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand. | |||
| 641 | { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand. | |||
| 642 | { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb. | |||
| 643 | ||||
| 644 | { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw | |||
| 645 | { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw | |||
| 646 | { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw | |||
| 647 | }; | |||
| 648 | ||||
| 649 | if (ST->hasBWI() && Op2Info.isUniform()) | |||
| 650 | if (const auto *Entry = | |||
| 651 | CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second)) | |||
| 652 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 653 | return LT.first * *KindCost; | |||
| 654 | ||||
| 655 | static const CostKindTblEntry AVX512UniformCostTable[] = { | |||
| 656 | { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split. | |||
| 657 | { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split. | |||
| 658 | { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split. | |||
| 659 | ||||
| 660 | { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld | |||
| 661 | { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld | |||
| 662 | { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad | |||
| 663 | ||||
| 664 | { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq | |||
| 665 | { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq | |||
| 666 | { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq | |||
| 667 | { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq | |||
| 668 | { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq | |||
| 669 | { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq | |||
| 670 | { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq | |||
| 671 | }; | |||
| 672 | ||||
| 673 | if (ST->hasAVX512() && Op2Info.isUniform()) | |||
| 674 | if (const auto *Entry = | |||
| 675 | CostTableLookup(AVX512UniformCostTable, ISD, LT.second)) | |||
| 676 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 677 | return LT.first * *KindCost; | |||
| 678 | ||||
| 679 | static const CostKindTblEntry AVX2UniformCostTable[] = { | |||
| 680 | // Uniform splats are cheaper for the following instructions. | |||
| 681 | { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand. | |||
| 682 | { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand. | |||
| 683 | { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb. | |||
| 684 | { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand. | |||
| 685 | { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand. | |||
| 686 | { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb. | |||
| 687 | ||||
| 688 | { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw. | |||
| 689 | { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw. | |||
| 690 | { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw. | |||
| 691 | { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw. | |||
| 692 | { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw. | |||
| 693 | { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw. | |||
| 694 | ||||
| 695 | { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld | |||
| 696 | { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld | |||
| 697 | { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad | |||
| 698 | { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld | |||
| 699 | { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld | |||
| 700 | { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad | |||
| 701 | ||||
| 702 | { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq | |||
| 703 | { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq | |||
| 704 | { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle. | |||
| 705 | { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq | |||
| 706 | { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq | |||
| 707 | { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle. | |||
| 708 | }; | |||
| 709 | ||||
| 710 | if (ST->hasAVX2() && Op2Info.isUniform()) | |||
| 711 | if (const auto *Entry = | |||
| 712 | CostTableLookup(AVX2UniformCostTable, ISD, LT.second)) | |||
| 713 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 714 | return LT.first * *KindCost; | |||
| 715 | ||||
| 716 | static const CostKindTblEntry AVXUniformCostTable[] = { | |||
| 717 | { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand. | |||
| 718 | { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand. | |||
| 719 | { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb. | |||
| 720 | { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split. | |||
| 721 | { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split. | |||
| 722 | { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split. | |||
| 723 | ||||
| 724 | { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw. | |||
| 725 | { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw. | |||
| 726 | { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw. | |||
| 727 | { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split. | |||
| 728 | { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split. | |||
| 729 | { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split. | |||
| 730 | ||||
| 731 | { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld. | |||
| 732 | { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld. | |||
| 733 | { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad. | |||
| 734 | { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split. | |||
| 735 | { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split. | |||
| 736 | { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split. | |||
| 737 | ||||
| 738 | { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq. | |||
| 739 | { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq. | |||
| 740 | { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle. | |||
| 741 | { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split. | |||
| 742 | { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split. | |||
| 743 | { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split. | |||
| 744 | }; | |||
| 745 | ||||
| 746 | // XOP has faster vXi8 shifts. | |||
| 747 | if (ST->hasAVX() && Op2Info.isUniform() && | |||
| 748 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) | |||
| 749 | if (const auto *Entry = | |||
| 750 | CostTableLookup(AVXUniformCostTable, ISD, LT.second)) | |||
| 751 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 752 | return LT.first * *KindCost; | |||
| 753 | ||||
| 754 | static const CostKindTblEntry SSE2UniformCostTable[] = { | |||
| 755 | // Uniform splats are cheaper for the following instructions. | |||
| 756 | { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand. | |||
| 757 | { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand. | |||
| 758 | { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence. | |||
| 759 | ||||
| 760 | { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw. | |||
| 761 | { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw. | |||
| 762 | { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw. | |||
| 763 | ||||
| 764 | { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld | |||
| 765 | { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld. | |||
| 766 | { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad. | |||
| 767 | ||||
| 768 | { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq. | |||
| 769 | { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq. | |||
| 770 | { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub. | |||
| 771 | }; | |||
| 772 | ||||
| 773 | if (ST->hasSSE2() && Op2Info.isUniform() && | |||
| 774 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) | |||
| 775 | if (const auto *Entry = | |||
| 776 | CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) | |||
| 777 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 778 | return LT.first * *KindCost; | |||
| 779 | ||||
| 780 | static const CostKindTblEntry AVX512DQCostTable[] = { | |||
| 781 | { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq | |||
| 782 | { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq | |||
| 783 | { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq | |||
| 784 | }; | |||
| 785 | ||||
| 786 | // Look for AVX512DQ lowering tricks for custom cases. | |||
| 787 | if (ST->hasDQI()) | |||
| 788 | if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second)) | |||
| 789 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 790 | return LT.first * *KindCost; | |||
| 791 | ||||
| 792 | static const CostKindTblEntry AVX512BWCostTable[] = { | |||
| 793 | { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence. | |||
| 794 | { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence. | |||
| 795 | { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence. | |||
| 796 | { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence. | |||
| 797 | { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence. | |||
| 798 | { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence. | |||
| 799 | { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence. | |||
| 800 | { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence. | |||
| 801 | { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence. | |||
| 802 | ||||
| 803 | { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw | |||
| 804 | { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw | |||
| 805 | { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw | |||
| 806 | { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw | |||
| 807 | { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw | |||
| 808 | { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw | |||
| 809 | { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw | |||
| 810 | { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw | |||
| 811 | { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw | |||
| 812 | ||||
| 813 | { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb | |||
| 814 | { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw | |||
| 815 | ||||
| 816 | { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb | |||
| 817 | { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw | |||
| 818 | { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd | |||
| 819 | { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq | |||
| 820 | ||||
| 821 | { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb | |||
| 822 | { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw | |||
| 823 | ||||
| 824 | { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw | |||
| 825 | ||||
| 826 | { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb | |||
| 827 | { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw | |||
| 828 | { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd | |||
| 829 | { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq | |||
| 830 | }; | |||
| 831 | ||||
| 832 | // Look for AVX512BW lowering tricks for custom cases. | |||
| 833 | if (ST->hasBWI()) | |||
| 834 | if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second)) | |||
| 835 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 836 | return LT.first * *KindCost; | |||
| 837 | ||||
| 838 | static const CostKindTblEntry AVX512CostTable[] = { | |||
| 839 | { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence. | |||
| 840 | { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence. | |||
| 841 | { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence. | |||
| 842 | ||||
| 843 | { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsllvd/pack sequence. | |||
| 844 | { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence. | |||
| 845 | { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence. | |||
| 846 | ||||
| 847 | { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
| 848 | { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
| 849 | { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
| 850 | { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
| 851 | { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
| 852 | { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
| 853 | { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
| 854 | { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
| 855 | { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
| 856 | ||||
| 857 | { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, | |||
| 858 | { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, | |||
| 859 | { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, | |||
| 860 | { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
| 861 | { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
| 862 | { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
| 863 | { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, | |||
| 864 | { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, | |||
| 865 | { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, | |||
| 866 | ||||
| 867 | { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split | |||
| 868 | { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split | |||
| 869 | ||||
| 870 | { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split | |||
| 871 | { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split | |||
| 872 | ||||
| 873 | { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } }, | |||
| 874 | { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
| 875 | { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
| 876 | { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
| 877 | ||||
| 878 | { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } }, | |||
| 879 | { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
| 880 | { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
| 881 | { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
| 882 | ||||
| 883 | { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } }, | |||
| 884 | { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
| 885 | { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
| 886 | { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
| 887 | ||||
| 888 | { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) | |||
| 889 | { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) | |||
| 890 | { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) | |||
| 891 | { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add | |||
| 892 | { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/ | |||
| 893 | ||||
| 894 | { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/ | |||
| 895 | { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 896 | { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 897 | { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 898 | { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 899 | { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 900 | { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 901 | { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 902 | { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 903 | ||||
| 904 | { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 905 | { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 906 | { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 907 | { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/ | |||
| 908 | ||||
| 909 | { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/ | |||
| 910 | { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 911 | { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 912 | { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 913 | { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 914 | { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 915 | { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 916 | { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 917 | { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 918 | ||||
| 919 | { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 920 | { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 921 | { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 922 | { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/ | |||
| 923 | }; | |||
| 924 | ||||
| 925 | if (ST->hasAVX512()) | |||
| 926 | if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) | |||
| 927 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 928 | return LT.first * *KindCost; | |||
| 929 | ||||
| 930 | static const CostKindTblEntry AVX2ShiftCostTable[] = { | |||
| 931 | // Shifts on vXi64/vXi32 are legal on AVX2 even though we declare them | |||
| 932 | // custom, in order to detect the cases where the shift amount is a scalar. | |||
| 933 | { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org) | |||
| 934 | { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org) | |||
| 935 | { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org) | |||
| 936 | { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org) | |||
| 937 | { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org) | |||
| 938 | { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org) | |||
| 939 | { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org) | |||
| 940 | { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org) | |||
| 941 | { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org) | |||
| 942 | { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org) | |||
| 943 | }; | |||
| 944 | ||||
| 945 | if (ST->hasAVX512()) { | |||
| 946 | if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant()) | |||
| 947 | // On AVX512, a packed v32i16 shift left by a constant build_vector | |||
| 948 | // is lowered into a vector multiply (vpmullw). | |||
| 949 | return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, | |||
| 950 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
| 951 | } | |||
| 952 | ||||
| 953 | // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts). | |||
| 954 | if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) { | |||
| 955 | if (ISD == ISD::SHL && LT.second == MVT::v16i16 && | |||
| 956 | Op2Info.isConstant()) | |||
| 957 | // On AVX2, a packed v16i16 shift left by a constant build_vector | |||
| 958 | // is lowered into a vector multiply (vpmullw). | |||
| 959 | return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, | |||
| 960 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
| 961 | ||||
| 962 | if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second)) | |||
| 963 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 964 | return LT.first * *KindCost; | |||
| 965 | } | |||
| 966 | ||||
| 967 | static const CostKindTblEntry XOPShiftCostTable[] = { | |||
| 968 | // 128-bit shifts take 1cy, but right shifts require negation beforehand. | |||
| 969 | { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } }, | |||
| 970 | { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } }, | |||
| 971 | { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } }, | |||
| 972 | { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } }, | |||
| 973 | { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } }, | |||
| 974 | { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } }, | |||
| 975 | { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } }, | |||
| 976 | { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } }, | |||
| 977 | { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } }, | |||
| 978 | { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } }, | |||
| 979 | { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, | |||
| 980 | { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } }, | |||
| 981 | // 256-bit shifts require splitting if AVX2 didn't catch them above. | |||
| 982 | { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } }, | |||
| 983 | { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } }, | |||
| 984 | { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } }, | |||
| 985 | { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } }, | |||
| 986 | { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } }, | |||
| 987 | { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } }, | |||
| 988 | { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } }, | |||
| 989 | { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } }, | |||
| 990 | { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } }, | |||
| 991 | { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } }, | |||
| 992 | { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } }, | |||
| 993 | { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } }, | |||
| 994 | }; | |||
| 995 | ||||
| 996 | // Look for XOP lowering tricks. | |||
| 997 | if (ST->hasXOP()) { | |||
| 998 | // If the right shift is constant then we'll fold the negation so | |||
| 999 | // it's as cheap as a left shift. | |||
| 1000 | int ShiftISD = ISD; | |||
| 1001 | if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant()) | |||
| 1002 | ShiftISD = ISD::SHL; | |||
| 1003 | if (const auto *Entry = | |||
| 1004 | CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second)) | |||
| 1005 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 1006 | return LT.first * *KindCost; | |||
| 1007 | } | |||
| 1008 | ||||
| 1009 | if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) { | |||
| 1010 | MVT VT = LT.second; | |||
| 1011 | // A vector shift left by a non-uniform constant can be lowered | |||
| 1012 | // into a vector multiply. | |||
| 1013 | if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || | |||
| 1014 | ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) | |||
| 1015 | ISD = ISD::MUL; | |||
| 1016 | } | |||
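| | // Illustrative example (not from the source): with SSE2, | |||
| | //   shl <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4> | |||
| | // is costed by the tables below as a v4i32 multiply by <2, 4, 8, 16>. | |||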
| 1017 | ||||
| 1018 | static const CostKindTblEntry GLMCostTable[] = { | |||
| 1019 | { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss | |||
| 1020 | { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps | |||
| 1021 | { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd | |||
| 1022 | { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd | |||
| 1023 | }; | |||
| 1024 | ||||
| 1025 | if (ST->useGLMDivSqrtCosts()) | |||
| 1026 | if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second)) | |||
| 1027 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 1028 | return LT.first * *KindCost; | |||
| 1029 | ||||
| 1030 | static const CostKindTblEntry SLMCostTable[] = { | |||
| 1031 | { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld | |||
| 1032 | { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw | |||
| 1033 | { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd | |||
| 1034 | { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss | |||
| 1035 | { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd | |||
| 1036 | { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps | |||
| 1037 | { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss | |||
| 1038 | { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps | |||
| 1039 | { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd | |||
| 1040 | { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd | |||
| 1041 | { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd | |||
| 1042 | { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd | |||
| 1043 | // v2i64/v4i64 mul is custom lowered as a series of | |||
| 1044 | // long multiplies(3), shifts(3) and adds(2) | |||
| 1045 | // slm pmuludq throughput is 2 and addq throughput is 4 | |||
| 1046 | // thus: 3X2 (pmuludq throughput) + 3X1 (shift throughput) + | |||
| 1047 | // 2X4 (addq throughput) = 17 | |||
| 1048 | { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } }, | |||
| 1049 | // slm addq/subq throughput is 4 | |||
| 1050 | { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } }, | |||
| 1051 | { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } }, | |||
| 1052 | }; | |||
| 1053 | ||||
| 1054 | if (ST->useSLMArithCosts()) | |||
| 1055 | if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second)) | |||
| 1056 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 1057 | return LT.first * *KindCost; | |||
| 1058 | ||||
| 1059 | static const CostKindTblEntry AVX2CostTable[] = { | |||
| 1060 | { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence. | |||
| 1061 | { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence. | |||
| 1062 | { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence. | |||
| 1063 | { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence. | |||
| 1064 | ||||
| 1065 | { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence. | |||
| 1066 | { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence. | |||
| 1067 | { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence. | |||
| 1068 | { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence. | |||
| 1069 | ||||
| 1070 | { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence. | |||
| 1071 | { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence. | |||
| 1072 | { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence. | |||
| 1073 | { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence. | |||
| 1074 | { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence. | |||
| 1075 | { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence. | |||
| 1076 | ||||
| 1077 | { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb | |||
| 1078 | { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb | |||
| 1079 | { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw | |||
| 1080 | { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw | |||
| 1081 | { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd | |||
| 1082 | { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd | |||
| 1083 | { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq | |||
| 1084 | { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq | |||
| 1085 | ||||
| 1086 | { ISD::MUL, MVT::v16i16, { 2, 5, 1, 1 } }, // pmullw | |||
| 1087 | { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld | |||
| 1088 | { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld | |||
| 1089 | { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add | |||
| 1090 | { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add | |||
| 1091 | ||||
| 1092 | { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd | |||
| 1093 | { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps | |||
| 1094 | ||||
| 1095 | { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd | |||
| 1096 | { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss | |||
| 1097 | { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd | |||
| 1098 | { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps | |||
| 1099 | { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd | |||
| 1100 | { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps | |||
| 1101 | ||||
| 1102 | { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd | |||
| 1103 | { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss | |||
| 1104 | { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd | |||
| 1105 | { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps | |||
| 1106 | { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd | |||
| 1107 | { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps | |||
| 1108 | ||||
| 1109 | { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd | |||
| 1110 | { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss | |||
| 1111 | { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd | |||
| 1112 | { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps | |||
| 1113 | { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd | |||
| 1114 | { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps | |||
| 1115 | ||||
| 1116 | { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss | |||
| 1117 | { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps | |||
| 1118 | { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps | |||
| 1119 | { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd | |||
| 1120 | { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd | |||
| 1121 | { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd | |||
| 1122 | }; | |||
| 1123 | ||||
| 1124 | // Look for AVX2 lowering tricks for custom cases. | |||
| 1125 | if (ST->hasAVX2()) | |||
| 1126 | if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) | |||
| 1127 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 1128 | return LT.first * *KindCost; | |||
| 1129 | ||||
| 1130 | static const CostKindTblEntry AVX1CostTable[] = { | |||
| 1131 | // We don't have to scalarize unsupported ops. We can issue two half-sized | |||
| 1132 | // operations and we only need to extract the upper YMM half. | |||
| 1133 | // Two ops + 1 extract + 1 insert = 4. | |||
| 1134 | { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split | |||
| 1135 | { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split | |||
| 1136 | { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld | |||
| 1137 | { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } }, | |||
| 1138 | ||||
| 1139 | { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps | |||
| 1140 | { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps | |||
| 1141 | { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps | |||
| 1142 | { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps | |||
| 1143 | ||||
| 1144 | { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps | |||
| 1145 | { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps | |||
| 1146 | { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps | |||
| 1147 | { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps | |||
| 1148 | ||||
| 1149 | { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps | |||
| 1150 | { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps | |||
| 1151 | { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps | |||
| 1152 | { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps | |||
| 1153 | ||||
| 1154 | { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split | |||
| 1155 | { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split | |||
| 1156 | { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split | |||
| 1157 | { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split | |||
| 1158 | { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split | |||
| 1159 | { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split | |||
| 1160 | { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split | |||
| 1161 | { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split | |||
| 1162 | { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq | |||
| 1163 | { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq | |||
| 1164 | ||||
| 1165 | { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence. | |||
| 1166 | { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split. | |||
| 1167 | { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence. | |||
| 1168 | { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split. | |||
| 1169 | { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld | |||
| 1170 | { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split | |||
| 1171 | { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend. | |||
| 1172 | { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split. | |||
| 1173 | ||||
| 1174 | { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence. | |||
| 1175 | { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split. | |||
| 1176 | { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence. | |||
| 1177 | { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split. | |||
| 1178 | { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend. | |||
| 1179 | { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split. | |||
| 1180 | { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend. | |||
| 1181 | { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split. | |||
| 1182 | ||||
| 1183 | { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence. | |||
| 1184 | { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split. | |||
| 1185 | { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence. | |||
| 1186 | { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split. | |||
| 1187 | { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend. | |||
| 1188 | { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split. | |||
| 1189 | { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend. | |||
| 1190 | { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split. | |||
| 1191 | ||||
| 1192 | { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/ | |||
| 1193 | { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/ | |||
| 1194 | ||||
| 1195 | { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | |||
| 1196 | { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | |||
| 1197 | { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | |||
| 1198 | { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | |||
| 1199 | { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ | |||
| 1200 | { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ | |||
| 1201 | ||||
| 1202 | { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | |||
| 1203 | { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | |||
| 1204 | { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | |||
| 1205 | { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ | |||
| 1206 | { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ | |||
| 1207 | { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ | |||
| 1208 | ||||
| 1209 | { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ | |||
| 1210 | { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ | |||
| 1211 | { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ | |||
| 1212 | { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ | |||
| 1213 | { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/ | |||
| 1214 | { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/ | |||
| 1215 | ||||
| 1216 | { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/ | |||
| 1217 | { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/ | |||
| 1218 | { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/ | |||
| 1219 | { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/ | |||
| 1220 | { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/ | |||
| 1221 | { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/ | |||
| 1222 | }; | |||
| 1223 | ||||
| 1224 | if (ST->hasAVX()) | |||
| 1225 | if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) | |||
| 1226 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 1227 | return LT.first * *KindCost; | |||
| 1228 | ||||
| 1229 | static const CostKindTblEntry SSE42CostTable[] = { | |||
| 1230 | { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 1231 | { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 1232 | { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 1233 | { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 1234 | ||||
| 1235 | { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 1236 | { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 1237 | { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 1238 | { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 1239 | ||||
| 1240 | { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 1241 | { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 1242 | { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 1243 | { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 1244 | ||||
| 1245 | { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 1246 | { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 1247 | { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 1248 | { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 1249 | ||||
| 1250 | { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add | |||
| 1251 | }; | |||
| 1252 | ||||
| 1253 | if (ST->hasSSE42()) | |||
| 1254 | if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second)) | |||
| 1255 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 1256 | return LT.first * *KindCost; | |||
| 1257 | ||||
| 1258 | static const CostKindTblEntry SSE41CostTable[] = { | |||
| 1259 | { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence. | |||
| 1260 | { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence. | |||
| 1261 | { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld | |||
| 1262 | ||||
| 1263 | { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence. | |||
| 1264 | { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence. | |||
| 1265 | { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend. | |||
| 1266 | { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. | |||
| 1267 | ||||
| 1268 | { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence. | |||
| 1269 | { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence. | |||
| 1270 | { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend. | |||
| 1271 | { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence. | |||
| 1272 | ||||
| 1273 | { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org) | |||
| 1274 | }; | |||
| 1275 | ||||
| 1276 | if (ST->hasSSE41()) | |||
| 1277 | if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second)) | |||
| 1278 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 1279 | return LT.first * *KindCost; | |||
| 1280 | ||||
| 1281 | static const CostKindTblEntry SSE2CostTable[] = { | |||
| 1282 | // We don't correctly identify costs of casts because they are marked as | |||
| 1283 | // custom. | |||
| 1284 | { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence. | |||
| 1285 | { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence. | |||
| 1286 | { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq. | |||
| 1287 | { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. | |||
| 1288 | ||||
| 1289 | { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence. | |||
| 1290 | { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence. | |||
| 1291 | { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend. | |||
| 1292 | { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. | |||
| 1293 | ||||
| 1294 | { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence. | |||
| 1295 | { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence. | |||
| 1296 | { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend. | |||
| 1297 | { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence. | |||
| 1298 | ||||
| 1299 | { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand | |||
| 1300 | { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand | |||
| 1301 | { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand | |||
| 1302 | { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand | |||
| 1303 | ||||
| 1304 | { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por | |||
| 1305 | { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por | |||
| 1306 | { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por | |||
| 1307 | { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por | |||
| 1308 | ||||
| 1309 | { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor | |||
| 1310 | { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor | |||
| 1311 | { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor | |||
| 1312 | { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor | |||
| 1313 | ||||
| 1314 | { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq | |||
| 1315 | { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq | |||
| 1316 | ||||
| 1317 | { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw | |||
| 1318 | { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle | |||
| 1319 | { ISD::MUL, MVT::v2i64, { 8, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add | |||
| 1320 | ||||
| 1321 | { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
| 1322 | { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
| 1323 | { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
| 1324 | { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
| 1325 | ||||
| 1326 | { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
| 1327 | { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
| 1328 | { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
| 1329 | { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
| 1330 | ||||
| 1331 | { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
| 1332 | { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
| 1333 | { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
| 1334 | ||||
| 1335 | { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
| 1336 | { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
| 1337 | { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
| 1338 | ||||
| 1339 | { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
| 1340 | { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/ | |||
| 1341 | }; | |||
| 1342 | ||||
| 1343 | if (ST->hasSSE2()) | |||
| 1344 | if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) | |||
| 1345 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 1346 | return LT.first * *KindCost; | |||
| 1347 | ||||
| 1348 | static const CostKindTblEntry SSE1CostTable[] = { | |||
| 1349 | { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/ | |||
| 1350 | { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/ | |||
| 1351 | ||||
| 1352 | { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/ | |||
| 1353 | { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/ | |||
| 1354 | ||||
| 1355 | { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ | |||
| 1356 | { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ | |||
| 1357 | ||||
| 1358 | { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ | |||
| 1359 | { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ | |||
| 1360 | ||||
| 1361 | { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/ | |||
| 1362 | { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/ | |||
| 1363 | }; | |||
| 1364 | ||||
| 1365 | if (ST->hasSSE1()) | |||
| 1366 | if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second)) | |||
| 1367 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 1368 | return LT.first * *KindCost; | |||
| 1369 | ||||
| 1370 | static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets | |||
| 1371 | { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/ | |||
| 1372 | { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/ | |||
| 1373 | { ISD::MUL, MVT::i64, { 2 } }, // Nehalem from http://www.agner.org/ | |||
| 1374 | }; | |||
| 1375 | ||||
| 1376 | if (ST->is64Bit()) | |||
| 1377 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second)) | |||
| 1378 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 1379 | return LT.first * *KindCost; | |||
| 1380 | ||||
| 1381 | static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets | |||
| 1382 | { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/ | |||
| 1383 | { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/ | |||
| 1384 | { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/ | |||
| 1385 | ||||
| 1386 | { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/ | |||
| 1387 | { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/ | |||
| 1388 | { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/ | |||
| 1389 | ||||
| 1390 | { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87) | |||
| 1391 | { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87) | |||
| 1392 | { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87) | |||
| 1393 | { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87) | |||
| 1394 | { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87) | |||
| 1395 | }; | |||
| 1396 | ||||
| 1397 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second)) | |||
| 1398 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 1399 | return LT.first * *KindCost; | |||
| 1400 | ||||
| 1401 | // It is not a good idea to vectorize division. We have to scalarize it and | |||
| 1402 | // in the process we will often end up having to spill regular | |||
| 1403 | // registers. The overhead of division is going to dominate most kernels | |||
| 1404 | // anyway, so try hard to prevent vectorization of division - it is | |||
| 1405 | // generally a bad idea. Assume somewhat arbitrarily that we have to be able | |||
| 1406 | // to hide "20 cycles" for each lane. | |||
| 1407 | if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() && | |||
| 1408 | (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV || | |||
| 1409 | ISD == ISD::UREM)) { | |||
| 1410 | InstructionCost ScalarCost = | |||
| 1411 | getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind, | |||
| 1412 | Op1Info.getNoProps(), Op2Info.getNoProps()); | |||
| 1413 | return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost; | |||
| 1414 | } | |||
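| | // Worked example (illustrative): for "sdiv <4 x i32>" on SSE2 the type | |||
| | // legalizes to one v4i32 register (LT.first == 1), so the block above | |||
| | // returns 20 * 1 * 4 * ScalarCost - deliberately huge. | |||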
| 1415 | ||||
| 1416 | // Handle some basic single instruction code size cases. | |||
| 1417 | if (CostKind == TTI::TCK_CodeSize) { | |||
| 1418 | switch (ISD) { | |||
| 1419 | case ISD::FADD: | |||
| 1420 | case ISD::FSUB: | |||
| 1421 | case ISD::FMUL: | |||
| 1422 | case ISD::FDIV: | |||
| 1423 | case ISD::FNEG: | |||
| 1424 | case ISD::AND: | |||
| 1425 | case ISD::OR: | |||
| 1426 | case ISD::XOR: | |||
| 1427 | return LT.first; | |||
| 1429 | } | |||
| 1430 | } | |||
| 1431 | ||||
| 1432 | // Fallback to the default implementation. | |||
| 1433 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info, | |||
| 1434 | Args, CxtI); | |||
| 1435 | } | |||
| 1436 | ||||
| 1437 | InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, | |||
| 1438 | VectorType *BaseTp, | |||
| 1439 | ArrayRef<int> Mask, | |||
| 1440 | TTI::TargetCostKind CostKind, | |||
| 1441 | int Index, VectorType *SubTp, | |||
| 1442 | ArrayRef<const Value *> Args) { | |||
| 1443 | // 64-bit packed float vectors (v2f32) are widened to type v4f32. | |||
| 1444 | // 64-bit packed integer vectors (v2i32) are widened to type v4i32. | |||
| 1445 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp); | |||
| 1446 | ||||
| 1447 | Kind = improveShuffleKindFromMask(Kind, Mask); | |||
| 1448 | ||||
| 1449 | // Treat Transpose as 2-op shuffles - there's no difference in lowering. | |||
| 1450 | if (Kind == TTI::SK_Transpose) | |||
| 1451 | Kind = TTI::SK_PermuteTwoSrc; | |||
| 1452 | ||||
| 1453 | // For Broadcasts we are splatting the first element from the first input | |||
| 1454 | // register, so we only need to reference that input; all the output | |||
| 1455 | // registers are the same. | |||
| 1456 | if (Kind == TTI::SK_Broadcast) | |||
| 1457 | LT.first = 1; | |||
| 1458 | ||||
| 1459 | // Subvector extractions are free if they start at the beginning of a | |||
| 1460 | // vector and cheap if the subvectors are aligned. | |||
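| | // For example (illustrative): extracting elements [0,4) of a legal | |||
| | // <8 x float> is free, while extracting the aligned upper half [4,8) | |||
| | // is charged only the subvector's own legalization cost. | |||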
| 1461 | if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) { | |||
| 1462 | int NumElts = LT.second.getVectorNumElements(); | |||
| 1463 | if ((Index % NumElts) == 0) | |||
| 1464 | return 0; | |||
| 1465 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); | |||
| 1466 | if (SubLT.second.isVector()) { | |||
| 1467 | int NumSubElts = SubLT.second.getVectorNumElements(); | |||
| 1468 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) | |||
| 1469 | return SubLT.first; | |||
| 1470 | // Handle some cases for widening legalization. For now we only handle | |||
| 1471 | // cases where the original subvector was naturally aligned and fits | |||
| 1472 | // evenly in its legalized subvector type. | |||
| 1473 | // FIXME: Remove some of the alignment restrictions. | |||
| 1474 | // FIXME: We can use permq for 64-bit or larger extracts from 256-bit | |||
| 1475 | // vectors. | |||
| 1476 | int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements(); | |||
| 1477 | if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 && | |||
| 1478 | (NumSubElts % OrigSubElts) == 0 && | |||
| 1479 | LT.second.getVectorElementType() == | |||
| 1480 | SubLT.second.getVectorElementType() && | |||
| 1481 | LT.second.getVectorElementType().getSizeInBits() == | |||
| 1482 | BaseTp->getElementType()->getPrimitiveSizeInBits()) { | |||
| 1483 | assert(NumElts >= NumSubElts && NumElts > OrigSubElts && | |||
| 1484 | "Unexpected number of elements!"); | |||
| 1485 | auto *VecTy = FixedVectorType::get(BaseTp->getElementType(), | |||
| 1486 | LT.second.getVectorNumElements()); | |||
| 1487 | auto *SubTy = FixedVectorType::get(BaseTp->getElementType(), | |||
| 1488 | SubLT.second.getVectorNumElements()); | |||
| 1489 | int ExtractIndex = alignDown((Index % NumElts), NumSubElts); | |||
| 1490 | InstructionCost ExtractCost = | |||
| 1491 | getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt, | |||
| 1492 | CostKind, ExtractIndex, SubTy); | |||
| 1493 | ||||
| 1494 | // If the original size is 32 bits or more, we can use pshufd. Otherwise, | |||
| 1495 | // if we have SSSE3, we can use pshufb. | |||
| 1496 | if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3()) | |||
| 1497 | return ExtractCost + 1; // pshufd or pshufb | |||
| 1498 | ||||
| 1499 | assert(SubTp->getPrimitiveSizeInBits() == 16 && | |||
| 1500 | "Unexpected vector size"); | |||
| 1501 | ||||
| 1502 | return ExtractCost + 2; // worst case pshufhw + pshufd | |||
| 1503 | } | |||
| 1504 | } | |||
| 1505 | } | |||
| 1506 | ||||
| 1507 | // Subvector insertions are cheap if the subvectors are aligned. | |||
| 1508 | // Note that in general, the insertion starting at the beginning of a vector | |||
| 1509 | // isn't free, because we need to preserve the rest of the wide vector. | |||
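| | // For example (illustrative): inserting a <4 x float> subvector at | |||
| | // element 4 of a legal <8 x float> is aligned and costs SubLT.first, | |||
| | // while inserting at element 2 is treated as a two-source permute. | |||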
| 1510 | if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) { | |||
| 1511 | int NumElts = LT.second.getVectorNumElements(); | |||
| 1512 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp); | |||
| 1513 | if (SubLT.second.isVector()) { | |||
| 1514 | int NumSubElts = SubLT.second.getVectorNumElements(); | |||
| 1515 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) | |||
| 1516 | return SubLT.first; | |||
| 1517 | } | |||
| 1518 | ||||
| 1519 | // If the insertion isn't aligned, treat it like a 2-op shuffle. | |||
| 1520 | Kind = TTI::SK_PermuteTwoSrc; | |||
| 1521 | } | |||
| 1522 | ||||
| 1523 | // Handle some common (illegal) sub-vector types as they are often very cheap | |||
| 1524 | // to shuffle even on targets without PSHUFB. | |||
| 1525 | EVT VT = TLI->getValueType(DL, BaseTp); | |||
| 1526 | if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 && | |||
| 1527 | !ST->hasSSSE3()) { | |||
| 1528 | static const CostTblEntry SSE2SubVectorShuffleTbl[] = { | |||
| 1529 | {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw | |||
| 1530 | {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw | |||
| 1531 | {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw | |||
| 1532 | {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw | |||
| 1533 | {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck | |||
| 1534 | ||||
| 1535 | {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw | |||
| 1536 | {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw | |||
| 1537 | {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus | |||
| 1538 | {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck | |||
| 1539 | ||||
| 1540 | {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq | |||
| 1541 | {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq | |||
| 1542 | {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq | |||
| 1543 | {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq | |||
| 1544 | ||||
| 1545 | {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw | |||
| 1546 | {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw | |||
| 1547 | {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw | |||
| 1548 | {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw | |||
| 1549 | {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck | |||
| 1550 | ||||
| 1551 | {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw | |||
| 1552 | {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw | |||
| 1553 | {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw | |||
| 1554 | {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw | |||
| 1555 | {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck | |||
| 1556 | }; | |||
| 1557 | ||||
| 1558 | if (ST->hasSSE2()) | |||
| 1559 | if (const auto *Entry = | |||
| 1560 | CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT())) | |||
| 1561 | return Entry->Cost; | |||
| 1562 | } | |||
| 1563 | ||||
| 1564 | // We are going to permute multiple sources and the result will be in multiple | |||
| 1565 | // destinations. We provide an accurate cost only for splits where the element | |||
| 1566 | // type remains the same. | |||
| 1567 | if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) { | |||
| 1568 | MVT LegalVT = LT.second; | |||
| 1569 | if (LegalVT.isVector() && | |||
| 1570 | LegalVT.getVectorElementType().getSizeInBits() == | |||
| 1571 | BaseTp->getElementType()->getPrimitiveSizeInBits() && | |||
| 1572 | LegalVT.getVectorNumElements() < | |||
| 1573 | cast<FixedVectorType>(BaseTp)->getNumElements()) { | |||
| 1574 | ||||
| 1575 | unsigned VecTySize = DL.getTypeStoreSize(BaseTp); | |||
| 1576 | unsigned LegalVTSize = LegalVT.getStoreSize(); | |||
| 1577 | // Number of source vectors after legalization: | |||
| 1578 | unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; | |||
| 1579 | // Number of destination vectors after legalization: | |||
| 1580 | InstructionCost NumOfDests = LT.first; | |||
| 1581 | ||||
| 1582 | auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(), | |||
| 1583 | LegalVT.getVectorNumElements()); | |||
| 1584 | ||||
| 1585 | if (!Mask.empty() && NumOfDests.isValid()) { | |||
| 1586 | // Try to perform a better estimation of the permutation. | |||
| 1587 | // 1. Split the source/destination vectors into real registers. | |||
| 1588 | // 2. Do the mask analysis to identify which real registers are | |||
| 1589 | // permuted. If more than one source register is used to build a | |||
| 1590 | // destination register, the cost for that destination register is | |||
| 1591 | // (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one | |||
| 1592 | // source register is used, build the mask and calculate the cost as a | |||
| 1593 | // PermuteSingleSrc. | |||
| 1594 | // Also, for the single-register permute we try to identify whether the | |||
| 1595 | // destination register is just a copy of the source register or a | |||
| 1596 | // copy of the previous destination register (the cost is | |||
| 1597 | // TTI::TCC_Basic). If the source register is just reused, the cost for | |||
| 1598 | // this operation is 0. | |||
| 1599 | unsigned E = *NumOfDests.getValue(); | |||
| 1600 | unsigned NormalizedVF = | |||
| 1601 | LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E); | |||
| 1602 | unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements(); | |||
| 1603 | unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements(); | |||
| 1604 | SmallVector<int> NormalizedMask(NormalizedVF, UndefMaskElem); | |||
| 1605 | copy(Mask, NormalizedMask.begin()); | |||
| 1606 | unsigned PrevSrcReg = 0; | |||
| 1607 | ArrayRef<int> PrevRegMask; | |||
| 1608 | InstructionCost Cost = 0; | |||
| 1609 | processShuffleMasks( | |||
| 1610 | NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {}, | |||
| 1611 | [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask, | |||
| 1612 | &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) { | |||
| 1613 | if (!ShuffleVectorInst::isIdentityMask(RegMask)) { | |||
| 1614 | // Check if the previous register can be just copied to the next | |||
| 1615 | // one. | |||
| 1616 | if (PrevRegMask.empty() || PrevSrcReg != SrcReg || | |||
| 1617 | PrevRegMask != RegMask) | |||
| 1618 | Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy, | |||
| 1619 | RegMask, CostKind, 0, nullptr); | |||
| 1620 | else | |||
| 1621 | // Just a copy of previous destination register. | |||
| 1622 | Cost += TTI::TCC_Basic; | |||
| 1623 | return; | |||
| 1624 | } | |||
| 1625 | if (SrcReg != DestReg && | |||
| 1626 | any_of(RegMask, [](int I) { return I != UndefMaskElem; })) { | |||
| 1627 | // Just a copy of the source register. | |||
| 1628 | Cost += TTI::TCC_Basic; | |||
| 1629 | } | |||
| 1630 | PrevSrcReg = SrcReg; | |||
| 1631 | PrevRegMask = RegMask; | |||
| 1632 | }, | |||
| 1633 | [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask, | |||
| 1634 | unsigned /*Unused*/, | |||
| 1635 | unsigned /*Unused*/) { | |||
| 1636 | Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask, | |||
| 1637 | CostKind, 0, nullptr); | |||
| 1638 | }); | |||
| 1639 | return Cost; | |||
| 1640 | } | |||
| 1641 | ||||
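| | // Rough illustration (assumed values): with NumOfSrcs == 2 and | |||
| | // NumOfDests == 2, the fallback below charges (2 - 1) * 2 == 2 | |||
| | // two-source permutes of the legal register type. | |||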
| 1642 | InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; | |||
| 1643 | return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, | |||
| 1644 | std::nullopt, CostKind, 0, nullptr); | |||
| 1645 | } | |||
| 1646 | ||||
| 1647 | return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp); | |||
| 1648 | } | |||
| 1649 | ||||
| 1650 | // For 2-input shuffles, we must account for splitting the 2 inputs into many. | |||
| 1651 | if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) { | |||
| 1652 | // We assume that source and destination have the same vector type. | |||
| 1653 | InstructionCost NumOfDests = LT.first; | |||
| 1654 | InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1; | |||
| 1655 | LT.first = NumOfDests * NumOfShufflesPerDest; | |||
| 1656 | } | |||
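| | // Worked example (illustrative): a v32i8 two-source shuffle that splits | |||
| | // into LT.first == 2 legal v16i8 halves is charged 2 * (2*2 - 1) == 6 | |||
| | // v16i8 shuffles by the update above. | |||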
| 1657 | ||||
| 1658 | static const CostTblEntry AVX512VBMIShuffleTbl[] = { | |||
| 1659 | {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb | |||
| 1660 | {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb | |||
| 1661 | ||||
| 1662 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb | |||
| 1663 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb | |||
| 1664 | ||||
| 1665 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b | |||
| 1666 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b | |||
| 1667 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b | |||
| 1668 | }; | |||
| 1669 | ||||
| 1670 | if (ST->hasVBMI()) | |||
| 1671 | if (const auto *Entry = | |||
| 1672 | CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) | |||
| 1673 | return LT.first * Entry->Cost; | |||
| 1674 | ||||
| 1675 | static const CostTblEntry AVX512BWShuffleTbl[] = { | |||
| 1676 | {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw | |||
| 1677 | {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw | |||
| 1678 | {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb | |||
| 1679 | ||||
| 1680 | {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw | |||
| 1681 | {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw | |||
| 1682 | {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw | |||
| 1683 | {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2 | |||
| 1684 | ||||
| 1685 | {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw | |||
| 1686 | {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw | |||
| 1687 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw | |||
| 1688 | {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw | |||
| 1689 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16 | |||
| 1690 | ||||
| 1691 | {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w | |||
| 1692 | {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w | |||
| 1693 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w | |||
| 1694 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w | |||
| 1695 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1 | |||
| 1696 | ||||
| 1697 | {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw | |||
| 1698 | {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb | |||
| 1699 | ||||
| 1700 | {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr | |||
| 1701 | {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr | |||
| 1702 | {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr | |||
| 1703 | }; | |||
| 1704 | ||||
| 1705 | if (ST->hasBWI()) | |||
| 1706 | if (const auto *Entry = | |||
| 1707 | CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) | |||
| 1708 | return LT.first * Entry->Cost; | |||
| 1709 | ||||
| 1710 | static const CostKindTblEntry AVX512ShuffleTbl[] = { | |||
| 1711 | {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd | |||
| 1712 | {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss | |||
| 1713 | {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq | |||
| 1714 | {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd | |||
| 1715 | {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw | |||
| 1716 | {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw | |||
| 1717 | {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb | |||
| 1718 | ||||
| 1719 | {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd | |||
| 1720 | {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps | |||
| 1721 | {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq | |||
| 1722 | {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd | |||
| 1723 | {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per llvm-mca | |||
| 1724 | {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per llvm-mca | |||
| 1725 | {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per llvm-mca | |||
| 1726 | ||||
| 1727 | {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd | |||
| 1728 | {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd | |||
| 1729 | {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd | |||
| 1730 | {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd | |||
| 1731 | {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd | |||
| 1732 | {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd | |||
| 1733 | {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd | |||
| 1734 | {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd | |||
| 1735 | {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr | |||
| 1736 | {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr | |||
| 1737 | {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr | |||
| 1738 | ||||
| 1739 | {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd | |||
| 1740 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd | |||
| 1741 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd | |||
| 1742 | {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps | |||
| 1743 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps | |||
| 1744 | {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps | |||
| 1745 | {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq | |||
| 1746 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq | |||
| 1747 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq | |||
| 1748 | {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd | |||
| 1749 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd | |||
| 1750 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd | |||
| 1751 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb | |||
| 1752 | ||||
| 1753 | {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd | |||
| 1754 | {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps | |||
| 1755 | {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q | |||
| 1756 | {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d | |||
| 1757 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd | |||
| 1758 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps | |||
| 1759 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q | |||
| 1760 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d | |||
| 1761 | {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd | |||
| 1762 | {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps | |||
| 1763 | {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q | |||
| 1764 | {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d | |||
| 1765 | ||||
| 1766 | // FIXME: This just applies the type legalization cost rules above | |||
| 1767 | // assuming these completely split. | |||
| 1768 | {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } }, | |||
| 1769 | {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } }, | |||
| 1770 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } }, | |||
| 1771 | {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } }, | |||
| 1772 | {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } }, | |||
| 1773 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } }, | |||
| 1774 | ||||
| 1775 | {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq | |||
| 1776 | {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq | |||
| 1777 | {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq | |||
| 1778 | {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd | |||
| 1779 | {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps | |||
| 1780 | {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq | |||
| 1781 | {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd | |||
| 1782 | }; | |||
| 1783 | ||||
| 1784 | if (ST->hasAVX512()) | |||
| 1785 | if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) | |||
| 1786 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 1787 | return LT.first * *KindCost; | |||
| 1788 | ||||
| 1789 | static const CostTblEntry AVX2ShuffleTbl[] = { | |||
| 1790 | {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd | |||
| 1791 | {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps | |||
| 1792 | {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq | |||
| 1793 | {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd | |||
| 1794 | {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw | |||
| 1795 | {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw | |||
| 1796 | {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb | |||
| 1797 | ||||
| 1798 | {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd | |||
| 1799 | {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps | |||
| 1800 | {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq | |||
| 1801 | {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd | |||
| 1802 | {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb | |||
| 1803 | {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb | |||
| 1804 | {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb | |||
| 1805 | ||||
| 1806 | {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb | |||
| 1807 | {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb | |||
| 1808 | {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb | |||
| 1809 | ||||
| 1810 | {TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr | |||
| 1811 | {TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr | |||
| 1812 | {TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr | |||
| 1813 | {TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr | |||
| 1814 | {TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr | |||
| 1815 | ||||
| 1816 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd | |||
| 1817 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps | |||
| 1818 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq | |||
| 1819 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd | |||
| 1820 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb | |||
| 1821 | // + vpblendvb | |||
| 1822 | {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb | |||
| 1823 | // + vpblendvb | |||
| 1824 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb | |||
| 1825 | // + vpblendvb | |||
| 1826 | ||||
| 1827 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd | |||
| 1828 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps | |||
| 1829 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd | |||
| 1830 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd | |||
| 1831 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb | |||
| 1832 | // + vpblendvb | |||
| 1833 | {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb | |||
| 1834 | // + vpblendvb | |||
| 1835 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb | |||
| 1836 | // + vpblendvb | |||
| 1837 | }; | |||
| 1838 | ||||
| 1839 | if (ST->hasAVX2()) | |||
| 1840 | if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) | |||
| 1841 | return LT.first * Entry->Cost; | |||
| 1842 | ||||
| 1843 | static const CostTblEntry XOPShuffleTbl[] = { | |||
| 1844 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd | |||
| 1845 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps | |||
| 1846 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd | |||
| 1847 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps | |||
| 1848 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm | |||
| 1849 | // + vinsertf128 | |||
| 1850 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm | |||
| 1851 | // + vinsertf128 | |||
| 1852 | ||||
| 1853 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm | |||
| 1854 | // + vinsertf128 | |||
| 1855 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm | |||
| 1856 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm | |||
| 1857 | // + vinsertf128 | |||
| 1858 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm | |||
| 1859 | }; | |||
| 1860 | ||||
| 1861 | if (ST->hasXOP()) | |||
| 1862 | if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) | |||
| 1863 | return LT.first * Entry->Cost; | |||
| 1864 | ||||
| 1865 | static const CostTblEntry AVX1ShuffleTbl[] = { | |||
| 1866 | {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd | |||
| 1867 | {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps | |||
| 1868 | {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd | |||
| 1869 | {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps | |||
| 1870 | {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128 | |||
| 1871 | {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128 | |||
| 1872 | {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128 | |||
| 1873 | ||||
| 1874 | {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd | |||
| 1875 | {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps | |||
| 1876 | {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd | |||
| 1877 | {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps | |||
| 1878 | {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb | |||
| 1879 | // + vinsertf128 | |||
| 1880 | {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb | |||
| 1881 | // + vinsertf128 | |||
| 1882 | {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb | |||
| 1883 | // + vinsertf128 | |||
| 1884 | ||||
| 1885 | {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd | |||
| 1886 | {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd | |||
| 1887 | {TTI::SK_Select, MVT::v8i32, 1}, // vblendps | |||
| 1888 | {TTI::SK_Select, MVT::v8f32, 1}, // vblendps | |||
| 1889 | {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor | |||
| 1890 | {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor | |||
| 1891 | {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor | |||
| 1892 | ||||
| 1893 | {TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd | |||
| 1894 | {TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd | |||
| 1895 | {TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | |||
| 1896 | {TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | |||
| 1897 | {TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 | |||
| 1898 | {TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 | |||
| 1899 | {TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128 | |||
| 1900 | ||||
| 1901 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd | |||
| 1902 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd | |||
| 1903 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | |||
| 1904 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | |||
| 1905 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb | |||
| 1906 | // + 2*por + vinsertf128 | |||
| 1907 | {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb | |||
| 1908 | // + 2*por + vinsertf128 | |||
| 1909 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb | |||
| 1910 | // + 2*por + vinsertf128 | |||
| 1911 | ||||
| 1912 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd | |||
| 1913 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd | |||
| 1914 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps | |||
| 1915 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps | |||
| 1916 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb | |||
| 1917 | // + 4*por + vinsertf128 | |||
| 1918 | {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb | |||
| 1919 | // + 4*por + vinsertf128 | |||
| 1920 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb | |||
| 1921 | // + 4*por + vinsertf128 | |||
| 1922 | }; | |||
| 1923 | ||||
| 1924 | if (ST->hasAVX()) | |||
| 1925 | if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) | |||
| 1926 | return LT.first * Entry->Cost; | |||
| 1927 | ||||
| 1928 | static const CostTblEntry SSE41ShuffleTbl[] = { | |||
| 1929 | {TTI::SK_Select, MVT::v2i64, 1}, // pblendw | |||
| 1930 | {TTI::SK_Select, MVT::v2f64, 1}, // movsd | |||
| 1931 | {TTI::SK_Select, MVT::v4i32, 1}, // pblendw | |||
| 1932 | {TTI::SK_Select, MVT::v4f32, 1}, // blendps | |||
| 1933 | {TTI::SK_Select, MVT::v8i16, 1}, // pblendw | |||
| 1934 | {TTI::SK_Select, MVT::v8f16, 1}, // pblendw | |||
| 1935 | {TTI::SK_Select, MVT::v16i8, 1} // pblendvb | |||
| 1936 | }; | |||
| 1937 | ||||
| 1938 | if (ST->hasSSE41()) | |||
| 1939 | if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) | |||
| 1940 | return LT.first * Entry->Cost; | |||
| 1941 | ||||
| 1942 | static const CostTblEntry SSSE3ShuffleTbl[] = { | |||
| 1943 | {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb | |||
| 1944 | {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb | |||
| 1945 | {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb | |||
| 1946 | ||||
| 1947 | {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb | |||
| 1948 | {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb | |||
| 1949 | {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb | |||
| 1950 | ||||
| 1951 | {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por | |||
| 1952 | {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por | |||
| 1953 | {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por | |||
| 1954 | ||||
| 1955 | {TTI::SK_Splice, MVT::v4i32, 1}, // palignr | |||
| 1956 | {TTI::SK_Splice, MVT::v4f32, 1}, // palignr | |||
| 1957 | {TTI::SK_Splice, MVT::v8i16, 1}, // palignr | |||
| 1958 | {TTI::SK_Splice, MVT::v8f16, 1}, // palignr | |||
| 1959 | {TTI::SK_Splice, MVT::v16i8, 1}, // palignr | |||
| 1960 | ||||
| 1961 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb | |||
| 1962 | {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb | |||
| 1963 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb | |||
| 1964 | ||||
| 1965 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por | |||
| 1966 | {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por | |||
| 1967 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por | |||
| 1968 | }; | |||
| 1969 | ||||
| 1970 | if (ST->hasSSSE3()) | |||
| 1971 | if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) | |||
| 1972 | return LT.first * Entry->Cost; | |||
| 1973 | ||||
| 1974 | static const CostTblEntry SSE2ShuffleTbl[] = { | |||
| 1975 | {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd | |||
| 1976 | {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd | |||
| 1977 | {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd | |||
| 1978 | {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd | |||
| 1979 | {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd | |||
| 1980 | {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd | |||
| 1981 | ||||
| 1982 | {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd | |||
| 1983 | {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd | |||
| 1984 | {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd | |||
| 1985 | {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd | |||
| 1986 | {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd | |||
| 1987 | {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw | |||
| 1988 | // + 2*pshufd + 2*unpck + packus | |||
| 1989 | ||||
| 1990 | {TTI::SK_Select, MVT::v2i64, 1}, // movsd | |||
| 1991 | {TTI::SK_Select, MVT::v2f64, 1}, // movsd | |||
| 1992 | {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps | |||
| 1993 | {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por | |||
| 1994 | {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por | |||
| 1995 | {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por | |||
| 1996 | ||||
| 1997 | {TTI::SK_Splice, MVT::v2i64, 1}, // shufpd | |||
| 1998 | {TTI::SK_Splice, MVT::v2f64, 1}, // shufpd | |||
| 1999 | {TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd} | |||
| 2000 | {TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + pslldq + por | |||
| 2001 | {TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + pslldq + por | |||
| 2002 | {TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + pslldq + por | |||
| 2003 | ||||
| 2004 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd | |||
| 2005 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd | |||
| 2006 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd | |||
| 2007 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw | |||
| 2008 | // + pshufd/unpck | |||
| 2009 | {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw | |||
| 2010 | // + pshufd/unpck | |||
| 2011 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 10}, // 2*pshuflw + 2*pshufhw | |||
| 2012 | // + 2*pshufd + 2*unpck + 2*packus | |||
| 2013 | ||||
| 2014 | {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // shufpd | |||
| 2015 | {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // shufpd | |||
| 2016 | {TTI::SK_PermuteTwoSrc, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd} | |||
| 2017 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 8}, // blend+permute | |||
| 2018 | {TTI::SK_PermuteTwoSrc, MVT::v8f16, 8}, // blend+permute | |||
| 2019 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 13}, // blend+permute | |||
| 2020 | }; | |||
| 2021 | ||||
| 2022 | static const CostTblEntry SSE3BroadcastLoadTbl[] = { | |||
| 2023 | {TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup | |||
| 2024 | }; | |||
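// A sketch of why this entry can be free (assuming the usual lowering):
// SSE3's movddup folds its load operand, so splatting a freshly loaded
// double, e.g.
//   %d = load double, ptr %p
//   %i = insertelement <2 x double> poison, double %d, i64 0
//   %v = shufflevector <2 x double> %i, <2 x double> poison,
//                      <2 x i32> zeroinitializer
// becomes a single 'movddup (%rdi), %xmm0'. The broadcast shuffle adds
// nothing beyond the load itself, which is costed separately.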
| 2025 | ||||
| 2026 | if (ST->hasSSE2()) { | |||
| 2027 | bool IsLoad = | |||
| 2028 | llvm::any_of(Args, [](const auto &V) { return isa<LoadInst>(V); }); | |||
| 2029 | if (ST->hasSSE3() && IsLoad) | |||
| 2030 | if (const auto *Entry = | |||
| 2031 | CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) { | |||
| 2032 | assert(isLegalBroadcastLoad(BaseTp->getElementType(), | |||
| 2033 | LT.second.getVectorElementCount()) && | |||
| 2034 | "Table entry missing from isLegalBroadcastLoad()"); | |||
| 2035 | return LT.first * Entry->Cost; | |||
| 2036 | } | |||
| 2037 | ||||
| 2038 | if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) | |||
| 2039 | return LT.first * Entry->Cost; | |||
| 2040 | } | |||
| 2041 | ||||
| 2042 | static const CostTblEntry SSE1ShuffleTbl[] = { | |||
| 2043 | { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps | |||
| 2044 | { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps | |||
| 2045 | { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps | |||
| 2046 | { TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps | |||
| 2047 | { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps | |||
| 2048 | { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps | |||
| 2049 | }; | |||
| 2050 | ||||
| 2051 | if (ST->hasSSE1()) | |||
| 2052 | if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) | |||
| 2053 | return LT.first * Entry->Cost; | |||
| 2054 | ||||
| 2055 | return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp); | |||
| 2056 | } | |||
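The function above is a cascade: each ISA level from AVX1 down to SSE1 gets
its own table, the first matching entry wins, and the per-shuffle entry cost
is scaled by LT.first (the number of legal-type pieces the shuffle was split
into during legalization). A minimal self-contained sketch of that pattern
follows; Entry, lookup and shuffleCost are hypothetical stand-ins for the
real CostTblEntry/CostTableLookup machinery, and the two table rows are just
the reverse-shuffle entries quoted above.

    #include <cstddef>
    #include <cstdint>
    #include <optional>

    // Hypothetical mirror of a CostTblEntry row: (shuffle kind, type, cost).
    struct Entry { int Kind; int VT; uint64_t Cost; };

    // Linear scan, first match wins (the behaviour CostTableLookup provides).
    template <std::size_t N>
    std::optional<uint64_t> lookup(const Entry (&Tbl)[N], int Kind, int VT) {
      for (const Entry &E : Tbl)
        if (E.Kind == Kind && E.VT == VT)
          return E.Cost;
      return std::nullopt;
    }

    uint64_t shuffleCost(bool HasAVX, bool HasSSE2, int Kind, int VT,
                         uint64_t NumPieces /* plays the role of LT.first */) {
      static const Entry AVXTbl[]  = {{/*Reverse*/ 1, /*v32i8*/ 7, 4}};
      static const Entry SSE2Tbl[] = {{/*Reverse*/ 1, /*v16i8*/ 6, 9}};
      if (HasAVX)
        if (std::optional<uint64_t> C = lookup(AVXTbl, Kind, VT))
          return NumPieces * *C;  // e.g. 1 * 4 for a v32i8 reverse on AVX1
      if (HasSSE2)
        if (std::optional<uint64_t> C = lookup(SSE2Tbl, Kind, VT))
          return NumPieces * *C;  // e.g. 2 * 9 once v32i8 splits into halves
      return NumPieces;           // stand-in for the BaseT:: fallback
    }

For instance, a v32i8 reverse on an SSE2-only target legalizes as two v16i8
halves and would be charged 2 * 9 = 18 by this scheme (ignoring any
cross-half fixup, which the real model bakes into its table values).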
| 2057 | ||||
| 2058 | InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, | |||
| 2059 | Type *Src, | |||
| 2060 | TTI::CastContextHint CCH, | |||
| 2061 | TTI::TargetCostKind CostKind, | |||
| 2062 | const Instruction *I) { | |||
| 2063 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
| 2064 | assert(ISD && "Invalid opcode"); | |||
| 2065 | ||||
| 2066 | // TODO: Allow non-throughput costs that aren't binary. | |||
| 2067 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { | |||
| 2068 | if (CostKind != TTI::TCK_RecipThroughput) | |||
| 2069 | return Cost == 0 ? 0 : 1; | |||
| 2070 | return Cost; | |||
| 2071 | }; | |||
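// For example (an illustration, not an exhaustive rule): a conversion whose
// throughput cost resolves to 5 below reports just 1 for the other cost
// kinds, while a genuinely free conversion (cost 0) stays 0 for every kind.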
| 2072 | ||||
| 2073 | // The cost tables include both specific, custom (non-legal) src/dst type | |||
| 2074 | // conversions and generic, legalized types. We test for the custom entries | |||
| 2075 | // first, before falling back to legalization. | |||
| 2076 | // FIXME: Need a better design of the cost table to handle non-simple types, | |||
| 2077 | // given the potentially massive combinations (elem_num x src_type x dst_type). | |||
| 2078 | static const TypeConversionCostTblEntry AVX512BWConversionTbl[] { | |||
| 2079 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, | |||
| 2080 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, | |||
| 2081 | ||||
| 2082 | // Mask sign extend has an instruction. | |||
| 2083 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, | |||
| 2084 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, | |||
| 2085 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, | |||
| 2086 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, | |||
| 2087 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, | |||
| 2088 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, | |||
| 2089 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, | |||
| 2090 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, | |||
| 2091 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, | |||
| 2092 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, | |||
| 2093 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, | |||
| 2094 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, | |||
| 2095 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | |||
| 2096 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, | |||
| 2097 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, | |||
| 2098 | { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, | |||
| 2099 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 }, | |||
| 2100 | ||||
| 2101 | // Mask zero extend is a sext + shift. | |||
| 2102 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, | |||
| 2103 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, | |||
| 2104 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, | |||
| 2105 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, | |||
| 2106 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, | |||
| 2107 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, | |||
| 2108 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, | |||
| 2109 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, | |||
| 2110 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, | |||
| 2111 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, | |||
| 2112 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, | |||
| 2113 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, | |||
| 2114 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, | |||
| 2115 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, | |||
| 2116 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, | |||
| 2117 | { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, | |||
| 2118 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 }, | |||
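// As a rough sketch of that "sext + shift" pair, with AVX512BW a
//   zext <8 x i1> %k to <8 x i16>
// can lower as
//   vpmovm2w %k0, %xmm0        ; sign-extend: each mask bit -> 0 / 0xFFFF
//   vpsrlw   $15, %xmm0, %xmm0 ; logical shift: 0xFFFF -> 1
// which is where the flat cost of 2 comes from.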
| 2119 | ||||
| 2120 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, | |||
| 2121 | { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, | |||
| 2122 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, | |||
| 2123 | { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, | |||
| 2124 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, | |||
| 2125 | { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, | |||
| 2126 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, | |||
| 2127 | { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, | |||
| 2128 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, | |||
| 2129 | { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, | |||
| 2130 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, | |||
| 2131 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, | |||
| 2132 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, | |||
| 2133 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, | |||
| 2134 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, | |||
| 2135 | { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, | |||
| 2136 | { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 }, | |||
| 2137 | ||||
| 2138 | { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 }, | |||
| 2139 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm | |||
| 2140 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb | |||
| 2141 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb | |||
| 2142 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb | |||
| 2143 | }; | |||
| 2144 | ||||
| 2145 | static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { | |||
| 2146 | // Mask sign extend has an instruction. | |||
| 2147 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, | |||
| 2148 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, | |||
| 2149 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, | |||
| 2150 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, | |||
| 2151 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, | |||
| 2152 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 }, | |||
| 2153 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, | |||
| 2154 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, | |||
| 2155 | ||||
| 2156 | // Mask zero extend is a sext + shift. | |||
| 2157 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, | |||
| 2158 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, | |||
| 2159 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, | |||
| 2160 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, | |||
| 2161 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, | |||
| 2162 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 }, | |||
| 2163 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, | |||
| 2164 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, | |||
| 2165 | ||||
| 2166 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, | |||
| 2167 | { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, | |||
| 2168 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, | |||
| 2169 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, | |||
| 2170 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, | |||
| 2171 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, | |||
| 2172 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, | |||
| 2173 | { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 }, | |||
| 2174 | ||||
| 2175 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, | |||
| 2176 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, | |||
| 2177 | ||||
| 2178 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, | |||
| 2179 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, | |||
| 2180 | ||||
| 2181 | { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, | |||
| 2182 | { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, | |||
| 2183 | ||||
| 2184 | { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, | |||
| 2185 | { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, | |||
| 2186 | }; | |||
| 2187 | ||||
| 2188 | // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and | |||
| 2189 | // 256-bit wide vectors. | |||
| 2190 | ||||
| 2191 | static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { | |||
| 2192 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, | |||
| 2193 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, | |||
| 2194 | { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, | |||
| 2195 | ||||
| 2196 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd | |||
| 2197 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd | |||
| 2198 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd | |||
| 2199 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd | |||
| 2200 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq | |||
| 2201 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq | |||
| 2202 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq | |||
| 2203 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd | |||
| 2204 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd | |||
| 2205 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd | |||
| 2206 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd | |||
| 2207 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd | |||
| 2208 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq | |||
| 2209 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq | |||
| 2210 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq | |||
| 2211 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb | |||
| 2212 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb | |||
| 2213 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb | |||
| 2214 | { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb | |||
| 2215 | { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb | |||
| 2216 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw | |||
| 2217 | { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw | |||
| 2218 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb | |||
| 2219 | { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb | |||
| 2220 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb | |||
| 2221 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb | |||
| 2222 | { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb | |||
| 2223 | { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb | |||
| 2224 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw | |||
| 2225 | { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw | |||
| 2226 | { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw | |||
| 2227 | { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd | |||
| 2228 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd | |||
| 2229 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb | |||
| 2230 | ||||
| 2231 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32 | |||
| 2232 | { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 }, | |||
| 2233 | { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 }, | |||
| 2234 | ||||
| 2235 | // Sign extend is zmm vpternlogd+vptruncdb. | |||
| 2236 | // Zero extend is zmm broadcast load+vptruncdw. | |||
| 2237 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 }, | |||
| 2238 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 }, | |||
| 2239 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 }, | |||
| 2240 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 }, | |||
| 2241 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 }, | |||
| 2242 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 }, | |||
| 2243 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 }, | |||
| 2244 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 }, | |||
| 2245 | ||||
| 2246 | // Sign extend is zmm vpternlogd+vptruncdw. | |||
| 2247 | // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw. | |||
| 2248 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 }, | |||
| 2249 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, | |||
| 2250 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 }, | |||
| 2251 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, | |||
| 2252 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 }, | |||
| 2253 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, | |||
| 2254 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 }, | |||
| 2255 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, | |||
| 2256 | ||||
| 2257 | { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd | |||
| 2258 | { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld | |||
| 2259 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd | |||
| 2260 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld | |||
| 2261 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd | |||
| 2262 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld | |||
| 2263 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq | |||
| 2264 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq | |||
| 2265 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq | |||
| 2266 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq | |||
| 2267 | ||||
| 2268 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd | |||
| 2269 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld | |||
| 2270 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq | |||
| 2271 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq | |||
| 2272 | ||||
| 2273 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, | |||
| 2274 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, | |||
| 2275 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, | |||
| 2276 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, | |||
| 2277 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, | |||
| 2278 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, | |||
| 2279 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, | |||
| 2280 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, | |||
| 2281 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, | |||
| 2282 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, | |||
| 2283 | ||||
| 2284 | { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right | |||
| 2285 | { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right | |||
| 2286 | ||||
| 2287 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, | |||
| 2288 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, | |||
| 2289 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 }, | |||
| 2290 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 }, | |||
| 2291 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, | |||
| 2292 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 }, | |||
| 2293 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, | |||
| 2294 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, | |||
| 2295 | ||||
| 2296 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, | |||
| 2297 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, | |||
| 2298 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 }, | |||
| 2299 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 }, | |||
| 2300 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, | |||
| 2301 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 }, | |||
| 2302 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, | |||
| 2303 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, | |||
| 2304 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 }, | |||
| 2305 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 }, | |||
| 2306 | ||||
| 2307 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 }, | |||
| 2308 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 }, | |||
| 2309 | { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, 15 }, | |||
| 2310 | { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, 11 }, | |||
| 2311 | { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, 31 }, | |||
| 2312 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 }, | |||
| 2313 | { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 }, | |||
| 2314 | { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 }, | |||
| 2315 | { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, 15 }, | |||
| 2316 | { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 }, | |||
| 2317 | { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 }, | |||
| 2318 | ||||
| 2319 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, | |||
| 2320 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 }, | |||
| 2321 | { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 }, | |||
| 2322 | { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, | |||
| 2323 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 }, | |||
| 2324 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 }, | |||
| 2325 | }; | |||
| 2326 | ||||
| 2327 | static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] { | |||
| 2328 | // Mask sign extend has an instruction. | |||
| 2329 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, | |||
| 2330 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, | |||
| 2331 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, | |||
| 2332 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, | |||
| 2333 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, | |||
| 2334 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, | |||
| 2335 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, | |||
| 2336 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, | |||
| 2337 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, | |||
| 2338 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, | |||
| 2339 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, | |||
| 2340 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, | |||
| 2341 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | |||
| 2342 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, | |||
| 2343 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 }, | |||
| 2344 | { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 }, | |||
| 2345 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 }, | |||
| 2346 | ||||
| 2347 | // Mask zero extend is a sext + shift. | |||
| 2348 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, | |||
| 2349 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, | |||
| 2350 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, | |||
| 2351 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, | |||
| 2352 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, | |||
| 2353 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, | |||
| 2354 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, | |||
| 2355 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, | |||
| 2356 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, | |||
| 2357 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, | |||
| 2358 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, | |||
| 2359 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, | |||
| 2360 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, | |||
| 2361 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, | |||
| 2362 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 }, | |||
| 2363 | { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 }, | |||
| 2364 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 }, | |||
| 2365 | ||||
| 2366 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, | |||
| 2367 | { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, | |||
| 2368 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, | |||
| 2369 | { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, | |||
| 2370 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, | |||
| 2371 | { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, | |||
| 2372 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, | |||
| 2373 | { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, | |||
| 2374 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, | |||
| 2375 | { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, | |||
| 2376 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, | |||
| 2377 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, | |||
| 2378 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, | |||
| 2379 | { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, | |||
| 2380 | { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 }, | |||
| 2381 | { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 }, | |||
| 2382 | { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 }, | |||
| 2383 | ||||
| 2384 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, | |||
| 2385 | }; | |||
| 2386 | ||||
| 2387 | static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = { | |||
| 2388 | // Mask sign extend has an instruction. | |||
| 2389 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, | |||
| 2390 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, | |||
| 2391 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, | |||
| 2392 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 }, | |||
| 2393 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, | |||
| 2394 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 }, | |||
| 2395 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, | |||
| 2396 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, | |||
| 2397 | ||||
| 2398 | // Mask zero extend is a sext + shift. | |||
| 2399 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, | |||
| 2400 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, | |||
| 2401 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, | |||
| 2402 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 }, | |||
| 2403 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, | |||
| 2404 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 }, | |||
| 2405 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, | |||
| 2406 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, | |||
| 2407 | ||||
| 2408 | { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 }, | |||
| 2409 | { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, | |||
| 2410 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, | |||
| 2411 | { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, | |||
| 2412 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, | |||
| 2413 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, | |||
| 2414 | { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 }, | |||
| 2415 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, | |||
| 2416 | ||||
| 2417 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, | |||
| 2418 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | |||
| 2419 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, | |||
| 2420 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, | |||
| 2421 | ||||
| 2422 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, | |||
| 2423 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, | |||
| 2424 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, | |||
| 2425 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, | |||
| 2426 | ||||
| 2427 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 }, | |||
| 2428 | { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 }, | |||
| 2429 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, | |||
| 2430 | { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 }, | |||
| 2431 | ||||
| 2432 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 }, | |||
| 2433 | { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, | |||
| 2434 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, | |||
| 2435 | { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, | |||
| 2436 | }; | |||
| 2437 | ||||
| 2438 | static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = { | |||
| 2439 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd | |||
| 2440 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd | |||
| 2441 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd | |||
| 2442 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8 | |||
| 2443 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq | |||
| 2444 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq | |||
| 2445 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq | |||
| 2446 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16 | |||
| 2447 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd | |||
| 2448 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd | |||
| 2449 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd | |||
| 2450 | { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, // vpslld+vptestmd | |||
| 2451 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq | |||
| 2452 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq | |||
| 2453 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd | |||
| 2454 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb | |||
| 2455 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw | |||
| 2456 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovdb | |||
| 2457 | ||||
| 2458 | // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb | |||
| 2459 | // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb | |||
| 2460 | { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 }, | |||
| 2461 | { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 }, | |||
| 2462 | { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 }, | |||
| 2463 | { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 }, | |||
| 2464 | { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 }, | |||
| 2465 | { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 }, | |||
| 2466 | { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 }, | |||
| 2467 | { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 }, | |||
| 2468 | ||||
| 2469 | // sign extend is vpcmpeq+maskedmove+vpmovdw | |||
| 2470 | // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw | |||
| 2471 | { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, | |||
| 2472 | { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 }, | |||
| 2473 | { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, | |||
| 2474 | { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 }, | |||
| 2475 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, | |||
| 2476 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 }, | |||
| 2477 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 }, | |||
| 2478 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 }, | |||
| 2479 | ||||
| 2480 | { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd | |||
| 2481 | { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld | |||
| 2482 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd | |||
| 2483 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld | |||
| 2484 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd | |||
| 2485 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld | |||
| 2486 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, // vpternlogd | |||
| 2487 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, // vpternlogd+psrld | |||
| 2488 | ||||
| 2489 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq | |||
| 2490 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq | |||
| 2491 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq | |||
| 2492 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq | |||
| 2493 | ||||
| 2494 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 }, | |||
| 2495 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 }, | |||
| 2496 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 }, | |||
| 2497 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 }, | |||
| 2498 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, | |||
| 2499 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, | |||
| 2500 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 }, | |||
| 2501 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 }, | |||
| 2502 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, | |||
| 2503 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, | |||
| 2504 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, | |||
| 2505 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, | |||
| 2506 | ||||
| 2507 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, | |||
| 2508 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 }, | |||
| 2509 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, | |||
| 2510 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 }, | |||
| 2511 | ||||
| 2512 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 }, | |||
| 2513 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 }, | |||
| 2514 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, | |||
| 2515 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 }, | |||
| 2516 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, | |||
| 2517 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 }, | |||
| 2518 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, | |||
| 2519 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
| 2520 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, | |||
| 2521 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, | |||
| 2522 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 }, | |||
| 2523 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, | |||
| 2524 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 }, | |||
| 2525 | ||||
| 2526 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 }, | |||
| 2527 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 }, | |||
| 2528 | { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 }, | |||
| 2529 | ||||
| 2530 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 }, | |||
| 2531 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 }, | |||
| 2532 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, | |||
| 2533 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 }, | |||
| 2534 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 }, | |||
| 2535 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, | |||
| 2536 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, | |||
| 2537 | }; | |||
| 2538 | ||||
| 2539 | static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { | |||
| 2540 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, | |||
| 2541 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 }, | |||
| 2542 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, | |||
| 2543 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, | |||
| 2544 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | |||
| 2545 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, | |||
| 2546 | ||||
| 2547 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 }, | |||
| 2548 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 }, | |||
| 2549 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 }, | |||
| 2550 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 }, | |||
| 2551 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | |||
| 2552 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, | |||
| 2553 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 }, | |||
| 2554 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 }, | |||
| 2555 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | |||
| 2556 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, | |||
| 2557 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 }, | |||
| 2558 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 }, | |||
| 2559 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | |||
| 2560 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, | |||
| 2561 | ||||
| 2562 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, | |||
| 2563 | ||||
| 2564 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 }, | |||
| 2565 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 }, | |||
| 2566 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 }, | |||
| 2567 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 }, | |||
| 2568 | { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 }, | |||
| 2569 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 }, | |||
| 2570 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 }, | |||
| 2571 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 }, | |||
| 2572 | { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 }, | |||
| 2573 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 }, | |||
| 2574 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, | |||
| 2575 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 }, | |||
| 2576 | ||||
| 2577 | { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 }, | |||
| 2578 | { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 }, | |||
| 2579 | ||||
| 2580 | { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 }, | |||
| 2581 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 }, | |||
| 2582 | { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 }, | |||
| 2583 | { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 }, | |||
| 2584 | ||||
| 2585 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 }, | |||
| 2586 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 }, | |||
| 2587 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 }, | |||
| 2588 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 }, | |||
| 2589 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, | |||
| 2590 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 }, | |||
| 2591 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 }, | |||
| 2592 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 }, | |||
| 2593 | ||||
| 2594 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 }, | |||
| 2595 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 }, | |||
| 2596 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 }, | |||
| 2597 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, | |||
| 2598 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, | |||
| 2599 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, | |||
| 2600 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 }, | |||
| 2601 | ||||
| 2602 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 }, | |||
| 2603 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 }, | |||
| 2604 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 }, | |||
| 2605 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, | |||
| 2606 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, | |||
| 2607 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, | |||
| 2608 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 }, | |||
| 2609 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 }, | |||
| 2610 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, | |||
| 2611 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 }, | |||
| 2612 | }; | |||
| 2613 | ||||
| 2614 | static const TypeConversionCostTblEntry AVXConversionTbl[] = { | |||
| 2615 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 }, | |||
| 2616 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 }, | |||
| 2617 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, | |||
| 2618 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 }, | |||
| 2619 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, | |||
| 2620 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, | |||
| 2621 | ||||
| 2622 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 }, | |||
| 2623 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 }, | |||
| 2624 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 }, | |||
| 2625 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 }, | |||
| 2626 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, | |||
| 2627 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, | |||
| 2628 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 }, | |||
| 2629 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 }, | |||
| 2630 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, | |||
| 2631 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, | |||
| 2632 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, | |||
| 2633 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, | |||
| 2634 | ||||
| 2635 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 }, | |||
| 2636 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 }, | |||
| 2637 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 }, | |||
| 2638 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 }, | |||
| 2639 | { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 }, | |||
| 2640 | ||||
| 2641 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, | |||
| 2642 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, | |||
| 2643 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb | |||
| 2644 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 }, | |||
| 2645 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, | |||
| 2646 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 }, | |||
| 2647 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw | |||
| 2648 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 }, | |||
| 2649 | ||||
| 2650 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 }, | |||
| 2651 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 }, | |||
| 2652 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 }, | |||
| 2653 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 }, | |||
| 2654 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 }, | |||
| 2655 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | |||
| 2656 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 }, | |||
| 2657 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 }, | |||
| 2658 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 }, | |||
| 2659 | { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 }, | |||
| 2660 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 }, | |||
| 2661 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 }, | |||
| 2662 | ||||
| 2663 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 }, | |||
| 2664 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 }, | |||
| 2665 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 }, | |||
| 2666 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 }, | |||
| 2667 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 }, | |||
| 2668 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, | |||
| 2669 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 }, | |||
| 2670 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 }, | |||
| 2671 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 }, | |||
| 2672 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, | |||
| 2673 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 }, | |||
| 2674 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, | |||
| 2675 | { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 }, | |||
| 2676 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 }, | |||
| 2677 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 }, | |||
| 2678 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, | |||
| 2679 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 }, | |||
| 2680 | ||||
| 2681 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 }, | |||
| 2682 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 }, | |||
| 2683 | { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 }, | |||
| 2684 | { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 }, | |||
| 2685 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 }, | |||
| 2686 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 }, | |||
| 2687 | { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 }, | |||
| 2688 | { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 }, | |||
| 2689 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 }, | |||
| 2690 | { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 }, | |||
| 2691 | { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 }, | |||
| 2692 | ||||
| 2693 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 }, | |||
| 2694 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 }, | |||
| 2695 | { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 }, | |||
| 2696 | { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 }, | |||
| 2697 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 }, | |||
| 2698 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 }, | |||
| 2699 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 }, | |||
| 2700 | { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 }, | |||
| 2701 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 }, | |||
| 2702 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, | |||
| 2703 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 }, | |||
| 2704 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 }, | |||
| 2705 | { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 }, | |||
| 2706 | ||||
| 2707 | { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 }, | |||
| 2708 | { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 }, | |||
| 2709 | }; | |||
| 2710 | ||||
| 2711 | static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { | |||
| 2712 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 }, | |||
| 2713 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 }, | |||
| 2714 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 }, | |||
| 2715 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 }, | |||
| 2716 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, | |||
| 2717 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, | |||
| 2718 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 }, | |||
| 2719 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 }, | |||
| 2720 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, | |||
| 2721 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, | |||
| 2722 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, | |||
| 2723 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, | |||
| 2724 | ||||
| 2725 | // These truncates end up widening elements. | |||
| 2726 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVZXBQ | |||
| 2727 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVZXWQ | |||
| 2728 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVZXBD | |||
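// (Illustration: for 'trunc <2 x i8> to <2 x i1>' the narrow result type is
// widened during legalization, and a single PMOVZXBQ already produces that
// widened lane layout, hence the cost of 1.)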
| 2729 | ||||
| 2730 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 }, | |||
| 2731 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 }, | |||
| 2732 | { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 }, | |||
| 2733 | ||||
| 2734 | { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 }, | |||
| 2735 | { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 }, | |||
| 2736 | { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 }, | |||
| 2737 | { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 }, | |||
| 2738 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 }, | |||
| 2739 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, | |||
| 2740 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 }, | |||
| 2741 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, | |||
| 2742 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, | |||
| 2743 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 }, | |||
| 2744 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 }, | |||
| 2745 | ||||
| 2746 | { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 }, | |||
| 2747 | { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 }, | |||
| 2748 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 }, | |||
| 2749 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 }, | |||
| 2750 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 }, | |||
| 2751 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, | |||
| 2752 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 }, | |||
| 2753 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, | |||
| 2754 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 }, | |||
| 2755 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 }, | |||
| 2756 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 }, | |||
| 2757 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 }, | |||
| 2758 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 }, | |||
| 2759 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 }, | |||
| 2760 | ||||
| 2761 | { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 }, | |||
| 2762 | { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 }, | |||
| 2763 | { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 }, | |||
| 2764 | { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 }, | |||
| 2765 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 }, | |||
| 2766 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 }, | |||
| 2767 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 }, | |||
| 2768 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 }, | |||
| 2769 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, | |||
| 2770 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 }, | |||
| 2771 | ||||
| 2772 | { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 }, | |||
| 2773 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, | |||
| 2774 | { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 }, | |||
| 2775 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 }, | |||
| 2776 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 }, | |||
| 2777 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 }, | |||
| 2778 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 }, | |||
| 2779 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 }, | |||
| 2780 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 }, | |||
| 2781 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 }, | |||
| 2782 | }; | |||
| 2783 | ||||
| 2784 | static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { | |||
| 2785 | // These are somewhat magic numbers justified by comparing the | |||
| 2786 | // output of llvm-mca for our various supported scheduler models | |||
| 2787 | // and basing them on the worst-case scenario. | |||
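// One plausible way to re-derive an entry (a workflow assumption, not a
// documented recipe): feed the expected lowering to llvm-mca and compare
// the reciprocal throughput across scheduler models, keeping the worst
// case, e.g. for the v4f32<-v4i32 sitofp (cvtdq2ps) entry:
//   $ echo 'cvtdq2ps %xmm0, %xmm1' | \
//     llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=atom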
| 2788 | { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 }, | |||
| 2789 | { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 }, | |||
| 2790 | { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 }, | |||
| 2791 | { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 }, | |||
| 2792 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 }, | |||
| 2793 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 }, | |||
| 2794 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 }, | |||
| 2795 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 }, | |||
| 2796 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 }, | |||
| 2797 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 }, | |||
| 2798 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 }, | |||
| 2799 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 }, | |||
| 2800 | ||||
| 2801 | { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 }, | |||
| 2802 | { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 }, | |||
| 2803 | { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 }, | |||
| 2804 | { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 }, | |||
| 2805 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 }, | |||
| 2806 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 }, | |||
| 2807 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 }, | |||
| 2808 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 }, | |||
| 2809 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 }, | |||
| 2810 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 }, | |||
| 2811 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 }, | |||
| 2812 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 }, | |||
| 2813 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 }, | |||
| 2814 | ||||
| 2815 | { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 }, | |||
| 2816 | { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 }, | |||
| 2817 | { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 }, | |||
| 2818 | { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 }, | |||
| 2819 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 }, | |||
| 2820 | { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 }, | |||
| 2821 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 }, | |||
| 2822 | { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 }, | |||
| 2823 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 }, | |||
| 2824 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 }, | |||
| 2825 | ||||
| 2826 | { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 }, | |||
| 2827 | { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 }, | |||
| 2828 | { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 }, | |||
| 2829 | { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 }, | |||
| 2830 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 }, | |||
| 2831 | { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 }, | |||
| 2832 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 }, | |||
| 2833 | { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 }, | |||
| 2834 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 }, | |||
| 2835 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 }, | |||
| 2836 | ||||
| 2837 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, | |||
| 2838 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, | |||
| 2839 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 }, | |||
| 2840 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 }, | |||
| 2841 | { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, | |||
| 2842 | { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 }, | |||
| 2843 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 }, | |||
| 2844 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 }, | |||
| 2845 | { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, | |||
| 2846 | { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 }, | |||
| 2847 | { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, | |||
| 2848 | { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 }, | |||
| 2849 | ||||
| 2850 | // These truncates are really widening elements. | |||
| 2851 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD | |||
| 2852 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ | |||
| 2853 | { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD | |||
| 2854 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD | |||
| 2855 | { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD | |||
| 2856 | { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW | |||
| 2857 | ||||
| 2858 | { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB | |||
| 2859 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, | |||
| 2860 | { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB | |||
| 2861 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, | |||
| 2862 | { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 }, | |||
| 2863 | { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 }, | |||
| 2864 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, | |||
| 2865 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 }, | |||
| 2866 | { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB | |||
| 2867 | { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW | |||
| 2868 | { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD | |||
| 2869 | }; | |||
| 2870 | ||||
| 2871 | // Attempt to map directly to (simple) MVT types to let us match custom entries. | |||
| 2872 | EVT SrcTy = TLI->getValueType(DL, Src); | |||
| 2873 | EVT DstTy = TLI->getValueType(DL, Dst); | |||
| 2874 | ||||
| 2875 | // The function getSimpleVT only handles simple value types. | |||
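// (e.g. a <3 x i17> vector has no MVT encoding, so isSimple() is false and
// we skip straight to the legalized-type matching further below.)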
| 2876 | if (SrcTy.isSimple() && DstTy.isSimple()) { | |||
| 2877 | MVT SimpleSrcTy = SrcTy.getSimpleVT(); | |||
| 2878 | MVT SimpleDstTy = DstTy.getSimpleVT(); | |||
| 2879 | ||||
| 2880 | if (ST->useAVX512Regs()) { | |||
| 2881 | if (ST->hasBWI()) | |||
| 2882 | if (const auto *Entry = ConvertCostTableLookup( | |||
| 2883 | AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | |||
| 2884 | return AdjustCost(Entry->Cost); | |||
| 2885 | ||||
| 2886 | if (ST->hasDQI()) | |||
| 2887 | if (const auto *Entry = ConvertCostTableLookup( | |||
| 2888 | AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | |||
| 2889 | return AdjustCost(Entry->Cost); | |||
| 2890 | ||||
| 2891 | if (ST->hasAVX512()) | |||
| 2892 | if (const auto *Entry = ConvertCostTableLookup( | |||
| 2893 | AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | |||
| 2894 | return AdjustCost(Entry->Cost); | |||
| 2895 | } | |||
| 2896 | ||||
| 2897 | if (ST->hasBWI()) | |||
| 2898 | if (const auto *Entry = ConvertCostTableLookup( | |||
| 2899 | AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | |||
| 2900 | return AdjustCost(Entry->Cost); | |||
| 2901 | ||||
| 2902 | if (ST->hasDQI()) | |||
| 2903 | if (const auto *Entry = ConvertCostTableLookup( | |||
| 2904 | AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) | |||
| 2905 | return AdjustCost(Entry->Cost); | |||
| 2906 | ||||
| 2907 | if (ST->hasAVX512()) | |||
| 2908 | if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, | |||
| 2909 | SimpleDstTy, SimpleSrcTy)) | |||
| 2910 | return AdjustCost(Entry->Cost); | |||
| 2911 | ||||
| 2912 | if (ST->hasAVX2()) { | |||
| 2913 | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, | |||
| 2914 | SimpleDstTy, SimpleSrcTy)) | |||
| 2915 | return AdjustCost(Entry->Cost); | |||
| 2916 | } | |||
| 2917 | ||||
| 2918 | if (ST->hasAVX()) { | |||
| 2919 | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, | |||
| 2920 | SimpleDstTy, SimpleSrcTy)) | |||
| 2921 | return AdjustCost(Entry->Cost); | |||
| 2922 | } | |||
| 2923 | ||||
| 2924 | if (ST->hasSSE41()) { | |||
| 2925 | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, | |||
| 2926 | SimpleDstTy, SimpleSrcTy)) | |||
| 2927 | return AdjustCost(Entry->Cost); | |||
| 2928 | } | |||
| 2929 | ||||
| 2930 | if (ST->hasSSE2()) { | |||
| 2931 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, | |||
| 2932 | SimpleDstTy, SimpleSrcTy)) | |||
| 2933 | return AdjustCost(Entry->Cost); | |||
| 2934 | } | |||
| 2935 | } | |||
| 2936 | ||||
| 2937 | // Fall back to legalized types. | |||
| 2938 | std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Src); | |||
| 2939 | std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Dst); | |||
| 2940 | ||||
| 2941 | // If we're truncating to the same legalized type, just assume it's free. | |||
| 2942 | if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second) | |||
| 2943 | return TTI::TCC_Free; | |||
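| | // Illustrative example (assuming vXi1 promotes to the compare-result | |||
| | // width without AVX512): a 'trunc <16 x i8> to <16 x i1>' legalizes both | |||
| | // sides to v16i8, so LTSrc.second == LTDest.second and the truncate is | |||
| | // reported as free above. | |||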
| 2944 | ||||
| 2945 | if (ST->useAVX512Regs()) { | |||
| 2946 | if (ST->hasBWI()) | |||
| 2947 | if (const auto *Entry = ConvertCostTableLookup( | |||
| 2948 | AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second)) | |||
| 2949 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
| 2950 | ||||
| 2951 | if (ST->hasDQI()) | |||
| 2952 | if (const auto *Entry = ConvertCostTableLookup( | |||
| 2953 | AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second)) | |||
| 2954 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
| 2955 | ||||
| 2956 | if (ST->hasAVX512()) | |||
| 2957 | if (const auto *Entry = ConvertCostTableLookup( | |||
| 2958 | AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second)) | |||
| 2959 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
| 2960 | } | |||
| 2961 | ||||
| 2962 | if (ST->hasBWI()) | |||
| 2963 | if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD, | |||
| 2964 | LTDest.second, LTSrc.second)) | |||
| 2965 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
| 2966 | ||||
| 2967 | if (ST->hasDQI()) | |||
| 2968 | if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD, | |||
| 2969 | LTDest.second, LTSrc.second)) | |||
| 2970 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
| 2971 | ||||
| 2972 | if (ST->hasAVX512()) | |||
| 2973 | if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, | |||
| 2974 | LTDest.second, LTSrc.second)) | |||
| 2975 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
| 2976 | ||||
| 2977 | if (ST->hasAVX2()) | |||
| 2978 | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, | |||
| 2979 | LTDest.second, LTSrc.second)) | |||
| 2980 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
| 2981 | ||||
| 2982 | if (ST->hasAVX()) | |||
| 2983 | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, | |||
| 2984 | LTDest.second, LTSrc.second)) | |||
| 2985 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
| 2986 | ||||
| 2987 | if (ST->hasSSE41()) | |||
| 2988 | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, | |||
| 2989 | LTDest.second, LTSrc.second)) | |||
| 2990 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
| 2991 | ||||
| 2992 | if (ST->hasSSE2()) | |||
| 2993 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, | |||
| 2994 | LTDest.second, LTSrc.second)) | |||
| 2995 | return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); | |||
| 2996 | ||||
| 2997 | // Fallback: for i8/i16 sitofp/uitofp cases, extend the source to i32 first | |||
| 2998 | // (a zext keeps uitofp values non-negative) and then cost it as a sitofp. | |||
| 2999 | if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && | |||
| 3000 | 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) { | |||
| 3001 | Type *ExtSrc = Src->getWithNewBitWidth(32); | |||
| 3002 | unsigned ExtOpc = | |||
| 3003 | (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt; | |||
| 3004 | ||||
| 3005 | // For scalar loads the extend would be free. | |||
| 3006 | InstructionCost ExtCost = 0; | |||
| 3007 | if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0)))) | |||
| 3008 | ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind); | |||
| 3009 | ||||
| 3010 | return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc, | |||
| 3011 | TTI::CastContextHint::None, CostKind); | |||
| 3012 | } | |||
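| | // Illustrative sketch of the decomposition above, with Int16Ty/Int32Ty/ | |||
| | // FloatTy as stand-ins for the respective IR types: a scalar | |||
| | // 'uitofp i16 %x to float' whose operand is not a load is costed as | |||
| | //   getCastInstrCost(Instruction::ZExt, Int32Ty, Int16Ty, CCH, CostKind) + | |||
| | //   getCastInstrCost(Instruction::SIToFP, FloatTy, Int32Ty, | |||
| | //                    TTI::CastContextHint::None, CostKind); | |||
| | // the ZExt term drops to 0 when the source is a scalar load. | |||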
| 3013 | ||||
| 3014 | // Fallback: for fptosi/fptoui i8/i16 cases, cost the conversion as an | |||
| 3015 | // fptosi to i32 followed by a truncate to the destination type. | |||
| 3016 | if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) && | |||
| 3017 | 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) { | |||
| 3018 | Type *TruncDst = Dst->getWithNewBitWidth(32); | |||
| 3019 | return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) + | |||
| 3020 | getCastInstrCost(Instruction::Trunc, Dst, TruncDst, | |||
| 3021 | TTI::CastContextHint::None, CostKind); | |||
| 3022 | } | |||
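| | // Illustrative sketch (Int8Ty/Int32Ty/FloatTy as stand-ins): a scalar | |||
| | // 'fptoui float %x to i8' is costed via the widened conversion as | |||
| | //   getCastInstrCost(Instruction::FPToSI, Int32Ty, FloatTy, CCH, CostKind) + | |||
| | //   getCastInstrCost(Instruction::Trunc, Int8Ty, Int32Ty, | |||
| | //                    TTI::CastContextHint::None, CostKind); | |||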
| 3023 | ||||
| 3024 | return AdjustCost( | |||
| 3025 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); | |||
| 3026 | } | |||
| 3027 | ||||
| 3028 | InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, | |||
| 3029 | Type *CondTy, | |||
| 3030 | CmpInst::Predicate VecPred, | |||
| 3031 | TTI::TargetCostKind CostKind, | |||
| 3032 | const Instruction *I) { | |||
| 3033 | // Early out if this type isn't scalar/vector integer/float. | |||
| 3034 | if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) | |||
| 3035 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, | |||
| 3036 | I); | |||
| 3037 | ||||
| 3038 | // Legalize the type. | |||
| 3039 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); | |||
| 3040 | ||||
| 3041 | MVT MTy = LT.second; | |||
| 3042 | ||||
| 3043 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
| 3044 | assert(ISD && "Invalid opcode"); | |||
| 3045 | ||||
| 3046 | InstructionCost ExtraCost = 0; | |||
| 3047 | if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { | |||
| 3048 | // Some vector comparison predicates cost extra instructions. | |||
| 3049 | // TODO: Should we invert this and assume worst case cmp costs | |||
| 3050 | // and reduce for particular predicates? | |||
| 3051 | if (MTy.isVector() && | |||
| 3052 | !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) || | |||
| 3053 | (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) || | |||
| 3054 | ST->hasBWI())) { | |||
| 3055 | // Fall back to I if a specific predicate wasn't specified. | |||
| 3056 | CmpInst::Predicate Pred = VecPred; | |||
| 3057 | if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE || | |||
| 3058 | Pred == CmpInst::BAD_FCMP_PREDICATE)) | |||
| 3059 | Pred = cast<CmpInst>(I)->getPredicate(); | |||
| 3060 | ||||
| 3061 | switch (Pred) { | |||
| 3062 | case CmpInst::Predicate::ICMP_NE: | |||
| 3063 | // xor(cmpeq(x,y),-1) | |||
| 3064 | ExtraCost = 1; | |||
| 3065 | break; | |||
| 3066 | case CmpInst::Predicate::ICMP_SGE: | |||
| 3067 | case CmpInst::Predicate::ICMP_SLE: | |||
| 3068 | // xor(cmpgt(x,y),-1) | |||
| 3069 | ExtraCost = 1; | |||
| 3070 | break; | |||
| 3071 | case CmpInst::Predicate::ICMP_ULT: | |||
| 3072 | case CmpInst::Predicate::ICMP_UGT: | |||
| 3073 | // cmpgt(xor(x,signbit),xor(y,signbit)) | |||
| 3074 | // xor(cmpeq(pmaxu(x,y),x),-1) | |||
| 3075 | ExtraCost = 2; | |||
| 3076 | break; | |||
| 3077 | case CmpInst::Predicate::ICMP_ULE: | |||
| 3078 | case CmpInst::Predicate::ICMP_UGE: | |||
| 3079 | if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) || | |||
| 3080 | (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) { | |||
| 3081 | // cmpeq(psubus(x,y),0) | |||
| 3082 | // cmpeq(pminu(x,y),x) | |||
| 3083 | ExtraCost = 1; | |||
| 3084 | } else { | |||
| 3085 | // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1) | |||
| 3086 | ExtraCost = 3; | |||
| 3087 | } | |||
| 3088 | break; | |||
| 3089 | case CmpInst::Predicate::FCMP_ONE: | |||
| 3090 | case CmpInst::Predicate::FCMP_UEQ: | |||
| 3091 | // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases. | |||
| 3092 | // Use FCMP_UEQ expansion - FCMP_ONE should be the same. | |||
| 3093 | if (CondTy && !ST->hasAVX()) | |||
| 3094 | return getCmpSelInstrCost(Opcode, ValTy, CondTy, | |||
| 3095 | CmpInst::Predicate::FCMP_UNO, CostKind) + | |||
| 3096 | getCmpSelInstrCost(Opcode, ValTy, CondTy, | |||
| 3097 | CmpInst::Predicate::FCMP_OEQ, CostKind) + | |||
| 3098 | getArithmeticInstrCost(Instruction::Or, CondTy, CostKind); | |||
| 3099 | ||||
| 3100 | break; | |||
| 3101 | case CmpInst::Predicate::BAD_ICMP_PREDICATE: | |||
| 3102 | case CmpInst::Predicate::BAD_FCMP_PREDICATE: | |||
| 3103 | // Assume worst case scenario and add the maximum extra cost. | |||
| 3104 | ExtraCost = 3; | |||
| 3105 | break; | |||
| 3106 | default: | |||
| 3107 | break; | |||
| 3108 | } | |||
| 3109 | } | |||
| 3110 | } | |||
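| | // Worked example: an SSE2-only 'icmp uge <16 x i8>' reaches the | |||
| | // ICMP_ULE/ICMP_UGE case with a scalar size below 32 bits, so | |||
| | // ExtraCost == 1, modelling one extra instruction (the pminu/psubus | |||
| | // based expansions noted above) on top of the base compare cost. | |||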
| 3111 | ||||
| 3112 | static const CostKindTblEntry SLMCostTbl[] = { | |||
| 3113 | // slm pcmpeq/pcmpgt throughput is 2 | |||
| 3114 | { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } }, | |||
| 3115 | // slm pblendvb/blendvpd/blendvps throughput is 4 | |||
| 3116 | { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd | |||
| 3117 | { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps | |||
| 3118 | { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb | |||
| 3119 | { ISD::SELECT, MVT::v4i32, { 4, 4, 1, 3 } }, // pblendvb | |||
| 3120 | { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb | |||
| 3121 | { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb | |||
| 3122 | }; | |||
| 3123 | ||||
| 3124 | static const CostKindTblEntry AVX512BWCostTbl[] = { | |||
| 3125 | { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
| 3126 | { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
| 3127 | { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } }, | |||
| 3128 | { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } }, | |||
| 3129 | ||||
| 3130 | { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
| 3131 | { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } }, | |||
| 3132 | }; | |||
| 3133 | ||||
| 3134 | static const CostKindTblEntry AVX512CostTbl[] = { | |||
| 3135 | { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } }, | |||
| 3136 | { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } }, | |||
| 3137 | { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } }, | |||
| 3138 | { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } }, | |||
| 3139 | ||||
| 3140 | { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } }, | |||
| 3141 | { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
| 3142 | { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } }, | |||
| 3143 | { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
| 3144 | { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
| 3145 | { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } }, | |||
| 3146 | { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } }, | |||
| 3147 | ||||
| 3148 | { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } }, | |||
| 3149 | { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
| 3150 | { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } }, | |||
| 3151 | { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
| 3152 | { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
| 3153 | { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
| 3154 | { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } }, | |||
| 3155 | { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } }, | |||
| 3156 | { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } }, | |||
| 3157 | { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } }, | |||
| 3158 | { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } }, | |||
| 3159 | { ISD::SELECT, MVT::v8f32, { 1, 1, 1, 1 } }, | |||
| 3160 | { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } }, | |||
| 3161 | { ISD::SELECT, MVT::f32, { 1, 1, 1, 1 } }, | |||
| 3162 | ||||
| 3163 | { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } }, | |||
| 3164 | { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
| 3165 | { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } }, | |||
| 3166 | { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } }, | |||
| 3167 | { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } }, | |||
| 3168 | { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } }, | |||
| 3169 | }; | |||
| 3170 | ||||
| 3171 | static const CostKindTblEntry AVX2CostTbl[] = { | |||
| 3172 | { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } }, | |||
| 3173 | { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } }, | |||
| 3174 | { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } }, | |||
| 3175 | { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } }, | |||
| 3176 | { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } }, | |||
| 3177 | { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } }, | |||
| 3178 | ||||
| 3179 | { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } }, | |||
| 3180 | { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } }, | |||
| 3181 | { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } }, | |||
| 3182 | { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } }, | |||
| 3183 | ||||
| 3184 | { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd | |||
| 3185 | { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps | |||
| 3186 | { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb | |||
| 3187 | { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb | |||
| 3188 | { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb | |||
| 3189 | { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb | |||
| 3190 | }; | |||
| 3191 | ||||
| 3192 | static const CostKindTblEntry XOPCostTbl[] = { | |||
| 3193 | { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } }, | |||
| 3194 | { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } }, | |||
| 3195 | }; | |||
| 3196 | ||||
| 3197 | static const CostKindTblEntry AVX1CostTbl[] = { | |||
| 3198 | { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } }, | |||
| 3199 | { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } }, | |||
| 3200 | { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } }, | |||
| 3201 | { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } }, | |||
| 3202 | { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } }, | |||
| 3203 | { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } }, | |||
| 3204 | ||||
| 3205 | // AVX1 does not support 256-bit integer compares; they are split into 128-bit halves. | |||
| 3206 | { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } }, | |||
| 3207 | { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } }, | |||
| 3208 | { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } }, | |||
| 3209 | { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } }, | |||
| 3210 | ||||
| 3211 | { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd | |||
| 3212 | { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps | |||
| 3213 | { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd | |||
| 3214 | { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps | |||
| 3215 | { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps | |||
| 3216 | { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps | |||
| 3217 | }; | |||
| 3218 | ||||
| 3219 | static const CostKindTblEntry SSE42CostTbl[] = { | |||
| 3220 | { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } }, | |||
| 3221 | }; | |||
| 3222 | ||||
| 3223 | static const CostKindTblEntry SSE41CostTbl[] = { | |||
| 3224 | { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } }, | |||
| 3225 | { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } }, | |||
| 3226 | ||||
| 3227 | { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd | |||
| 3228 | { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd | |||
| 3229 | { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps | |||
| 3230 | { ISD::SELECT, MVT::f32, { 2, 2, 1, 2 } }, // blendvps | |||
| 3231 | { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb | |||
| 3232 | { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb | |||
| 3233 | { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb | |||
| 3234 | { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb | |||
| 3235 | }; | |||
| 3236 | ||||
| 3237 | static const CostKindTblEntry SSE2CostTbl[] = { | |||
| 3238 | { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } }, | |||
| 3239 | { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } }, | |||
| 3240 | ||||
| 3241 | { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion | |||
| 3242 | { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
| 3243 | { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } }, | |||
| 3244 | { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } }, | |||
| 3245 | ||||
| 3246 | { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd | |||
| 3247 | { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd | |||
| 3248 | { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por | |||
| 3249 | { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por | |||
| 3250 | { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por | |||
| 3251 | { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por | |||
| 3252 | }; | |||
| 3253 | ||||
| 3254 | static const CostKindTblEntry SSE1CostTbl[] = { | |||
| 3255 | { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } }, | |||
| 3256 | { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } }, | |||
| 3257 | ||||
| 3258 | { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps | |||
| 3259 | { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps | |||
| 3260 | }; | |||
| 3261 | ||||
| 3262 | if (ST->useSLMArithCosts()) | |||
| 3263 | if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) | |||
| 3264 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 3265 | return LT.first * (ExtraCost + *KindCost); | |||
| 3266 | ||||
| 3267 | if (ST->hasBWI()) | |||
| 3268 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | |||
| 3269 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 3270 | return LT.first * (ExtraCost + *KindCost); | |||
| 3271 | ||||
| 3272 | if (ST->hasAVX512()) | |||
| 3273 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | |||
| 3274 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 3275 | return LT.first * (ExtraCost + *KindCost); | |||
| 3276 | ||||
| 3277 | if (ST->hasAVX2()) | |||
| 3278 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | |||
| 3279 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 3280 | return LT.first * (ExtraCost + *KindCost); | |||
| 3281 | ||||
| 3282 | if (ST->hasXOP()) | |||
| 3283 | if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) | |||
| 3284 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 3285 | return LT.first * (ExtraCost + *KindCost); | |||
| 3286 | ||||
| 3287 | if (ST->hasAVX()) | |||
| 3288 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | |||
| 3289 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 3290 | return LT.first * (ExtraCost + *KindCost); | |||
| 3291 | ||||
| 3292 | if (ST->hasSSE42()) | |||
| 3293 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | |||
| 3294 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 3295 | return LT.first * (ExtraCost + *KindCost); | |||
| 3296 | ||||
| 3297 | if (ST->hasSSE41()) | |||
| 3298 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) | |||
| 3299 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 3300 | return LT.first * (ExtraCost + *KindCost); | |||
| 3301 | ||||
| 3302 | if (ST->hasSSE2()) | |||
| 3303 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | |||
| 3304 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 3305 | return LT.first * (ExtraCost + *KindCost); | |||
| 3306 | ||||
| 3307 | if (ST->hasSSE1()) | |||
| 3308 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | |||
| 3309 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 3310 | return LT.first * (ExtraCost + *KindCost); | |||
| 3311 | ||||
| 3312 | // Assume a 3cy latency for fp select ops. | |||
| 3313 | if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select) | |||
| 3314 | if (ValTy->getScalarType()->isFloatingPointTy()) | |||
| 3315 | return 3; | |||
| 3316 | ||||
| 3317 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); | |||
| 3318 | } | |||
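| | // Worked example of the lookup scheme above: the four values in each | |||
| | // CostKindTblEntry are indexed by TargetCostKind as | |||
| | // { RecipThroughput, Latency, CodeSize, SizeAndLatency }. An SSE2-only | |||
| | // 'icmp ne <4 x i32>' with TCK_RecipThroughput hits SETCC v4i32 | |||
| | // (*KindCost == 1), the ICMP_NE expansion adds ExtraCost == 1, and | |||
| | // v4i32 is already legal (LT.first == 1), giving 1 * (1 + 1) = 2. | |||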
| 3319 | ||||
| 3320 | unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } | |||
| 3321 | ||||
| 3322 | InstructionCost | |||
| 3323 | X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, | |||
| 3324 | TTI::TargetCostKind CostKind) { | |||
| 3325 | // Costs should match the codegen from: | |||
| 3326 | // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll | |||
| 3327 | // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll | |||
| 3328 | // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll | |||
| 3329 | // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll | |||
| 3330 | // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll | |||
| 3331 | ||||
| 3332 | // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not | |||
| 3333 | // specialized in these tables yet. | |||
| 3334 | static const CostKindTblEntry AVX512VBMI2CostTbl[] = { | |||
| 3335 | { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } }, | |||
| 3336 | { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
| 3337 | { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } }, | |||
| 3338 | { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
| 3339 | { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
| 3340 | { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
| 3341 | { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
| 3342 | { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
| 3343 | { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } }, | |||
| 3344 | { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
| 3345 | { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
| 3346 | { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } }, | |||
| 3347 | { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
| 3348 | { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
| 3349 | { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } }, | |||
| 3350 | }; | |||
| 3351 | static const CostKindTblEntry AVX512BITALGCostTbl[] = { | |||
| 3352 | { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
| 3353 | { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } }, | |||
| 3354 | { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
| 3355 | { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } }, | |||
| 3356 | { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } }, | |||
| 3357 | { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } }, | |||
| 3358 | }; | |||
| 3359 | static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = { | |||
| 3360 | { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } }, | |||
| 3361 | { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
| 3362 | { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
| 3363 | { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
| 3364 | { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } }, | |||
| 3365 | { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
| 3366 | }; | |||
| 3367 | static const CostKindTblEntry AVX512CDCostTbl[] = { | |||
| 3368 | { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } }, | |||
| 3369 | { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } }, | |||
| 3370 | { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } }, | |||
| 3371 | { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } }, | |||
| 3372 | { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } }, | |||
| 3373 | { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } }, | |||
| 3374 | { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } }, | |||
| 3375 | { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } }, | |||
| 3376 | { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } }, | |||
| 3377 | { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } }, | |||
| 3378 | { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } }, | |||
| 3379 | { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } }, | |||
| 3380 | ||||
| 3381 | { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } }, | |||
| 3382 | { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } }, | |||
| 3383 | { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } }, | |||
| 3384 | { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } }, | |||
| 3385 | { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } }, | |||
| 3386 | { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } }, | |||
| 3387 | }; | |||
| 3388 | static const CostKindTblEntry AVX512BWCostTbl[] = { | |||
| 3389 | { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
| 3390 | { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } }, | |||
| 3391 | { ISD::BITREVERSE, MVT::v8i64, { 3 } }, | |||
| 3392 | { ISD::BITREVERSE, MVT::v16i32, { 3 } }, | |||
| 3393 | { ISD::BITREVERSE, MVT::v32i16, { 3 } }, | |||
| 3394 | { ISD::BITREVERSE, MVT::v64i8, { 2 } }, | |||
| 3395 | { ISD::BSWAP, MVT::v8i64, { 1 } }, | |||
| 3396 | { ISD::BSWAP, MVT::v16i32, { 1 } }, | |||
| 3397 | { ISD::BSWAP, MVT::v32i16, { 1 } }, | |||
| 3398 | { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } }, | |||
| 3399 | { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } }, | |||
| 3400 | { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } }, | |||
| 3401 | { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } }, | |||
| 3402 | { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } }, | |||
| 3403 | { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } }, | |||
| 3404 | { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } }, | |||
| 3405 | { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } }, | |||
| 3406 | { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } }, | |||
| 3407 | { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } }, | |||
| 3408 | { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } }, | |||
| 3409 | { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } }, | |||
| 3410 | { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } }, | |||
| 3411 | { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } }, | |||
| 3412 | { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } }, | |||
| 3413 | { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } }, | |||
| 3414 | { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } }, | |||
| 3415 | { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } }, | |||
| 3416 | { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } }, | |||
| 3417 | { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } }, | |||
| 3418 | { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } }, | |||
| 3419 | { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } }, | |||
| 3420 | { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } }, | |||
| 3421 | { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } }, | |||
| 3422 | { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } }, | |||
| 3423 | { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } }, | |||
| 3424 | { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } }, | |||
| 3425 | { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } }, | |||
| 3426 | { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } }, | |||
| 3427 | { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } }, | |||
| 3428 | { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } }, | |||
| 3429 | { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } }, | |||
| 3430 | { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } }, | |||
| 3431 | { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } }, | |||
| 3432 | { ISD::SADDSAT, MVT::v32i16, { 1 } }, | |||
| 3433 | { ISD::SADDSAT, MVT::v64i8, { 1 } }, | |||
| 3434 | { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
| 3435 | { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } }, | |||
| 3436 | { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
| 3437 | { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } }, | |||
| 3438 | { ISD::SSUBSAT, MVT::v32i16, { 1 } }, | |||
| 3439 | { ISD::SSUBSAT, MVT::v64i8, { 1 } }, | |||
| 3440 | { ISD::UADDSAT, MVT::v32i16, { 1 } }, | |||
| 3441 | { ISD::UADDSAT, MVT::v64i8, { 1 } }, | |||
| 3442 | { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
| 3443 | { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } }, | |||
| 3444 | { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } }, | |||
| 3445 | { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } }, | |||
| 3446 | { ISD::USUBSAT, MVT::v32i16, { 1 } }, | |||
| 3447 | { ISD::USUBSAT, MVT::v64i8, { 1 } }, | |||
| 3448 | }; | |||
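| | // Note on the table encodings: entries written with a single value, e.g. | |||
| | // { ISD::BSWAP, MVT::v8i64, { 1 } }, only populate the throughput slot; | |||
| | // the other cost-kind slots stay unset, so Entry->Cost[CostKind] yields | |||
| | // an empty optional for those kinds and the query falls through to a | |||
| | // later table or the base implementation. | |||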
| 3449 | static const CostKindTblEntry AVX512CostTbl[] = { | |||
| 3450 | { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } }, | |||
| 3451 | { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
| 3452 | { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } }, | |||
| 3453 | { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
| 3454 | { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
| 3455 | { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } }, | |||
| 3456 | { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } }, | |||
| 3457 | { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } }, | |||
| 3458 | { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } }, | |||
| 3459 | { ISD::BITREVERSE, MVT::v8i64, { 36 } }, | |||
| 3460 | { ISD::BITREVERSE, MVT::v16i32, { 24 } }, | |||
| 3461 | { ISD::BITREVERSE, MVT::v32i16, { 10 } }, | |||
| 3462 | { ISD::BITREVERSE, MVT::v64i8, { 10 } }, | |||
| 3463 | { ISD::BSWAP, MVT::v8i64, { 4 } }, | |||
| 3464 | { ISD::BSWAP, MVT::v16i32, { 4 } }, | |||
| 3465 | { ISD::BSWAP, MVT::v32i16, { 4 } }, | |||
| 3466 | { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } }, | |||
| 3467 | { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } }, | |||
| 3468 | { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } }, | |||
| 3469 | { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } }, | |||
| 3470 | { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } }, | |||
| 3471 | { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } }, | |||
| 3472 | { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } }, | |||
| 3473 | { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } }, | |||
| 3474 | { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } }, | |||
| 3475 | { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } }, | |||
| 3476 | { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } }, | |||
| 3477 | { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } }, | |||
| 3478 | { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } }, | |||
| 3479 | { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
| 3480 | { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } }, | |||
| 3481 | { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
| 3482 | { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
| 3483 | { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
| 3484 | { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } }, | |||
| 3485 | { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } }, | |||
| 3486 | { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } }, | |||
| 3487 | { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
| 3488 | { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } }, | |||
| 3489 | { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
| 3490 | { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } }, | |||
| 3491 | { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
| 3492 | { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } }, | |||
| 3493 | { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } }, | |||
| 3494 | { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } }, | |||
| 3495 | { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } }, | |||
| 3496 | { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } }, | |||
| 3497 | { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
| 3498 | { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } }, | |||
| 3499 | { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } }, | |||
| 3500 | { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } }, | |||
| 3501 | { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } }, | |||
| 3502 | { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } }, | |||
| 3503 | { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
| 3504 | { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } }, | |||
| 3505 | { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } }, | |||
| 3506 | { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } }, | |||
| 3507 | { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } }, | |||
| 3508 | { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } }, | |||
| 3509 | { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } }, | |||
| 3510 | { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } }, | |||
| 3511 | { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } }, | |||
| 3512 | { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } }, | |||
| 3513 | { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } }, | |||
| 3514 | { ISD::USUBSAT, MVT::v16i32, { 2 } }, // pmaxud + psubd | |||
| 3515 | { ISD::USUBSAT, MVT::v2i64, { 2 } }, // pmaxuq + psubq | |||
| 3516 | { ISD::USUBSAT, MVT::v4i64, { 2 } }, // pmaxuq + psubq | |||
| 3517 | { ISD::USUBSAT, MVT::v8i64, { 2 } }, // pmaxuq + psubq | |||
| 3518 | { ISD::UADDSAT, MVT::v16i32, { 3 } }, // not + pminud + paddd | |||
| 3519 | { ISD::UADDSAT, MVT::v2i64, { 3 } }, // not + pminuq + paddq | |||
| 3520 | { ISD::UADDSAT, MVT::v4i64, { 3 } }, // not + pminuq + paddq | |||
| 3521 | { ISD::UADDSAT, MVT::v8i64, { 3 } }, // not + pminuq + paddq | |||
| 3522 | { ISD::SADDSAT, MVT::v32i16, { 2 } }, | |||
| 3523 | { ISD::SADDSAT, MVT::v64i8, { 2 } }, | |||
| 3524 | { ISD::SSUBSAT, MVT::v32i16, { 2 } }, | |||
| 3525 | { ISD::SSUBSAT, MVT::v64i8, { 2 } }, | |||
| 3526 | { ISD::UADDSAT, MVT::v32i16, { 2 } }, | |||
| 3527 | { ISD::UADDSAT, MVT::v64i8, { 2 } }, | |||
| 3528 | { ISD::USUBSAT, MVT::v32i16, { 2 } }, | |||
| 3529 | { ISD::USUBSAT, MVT::v64i8, { 2 } }, | |||
| 3530 | { ISD::FMAXNUM, MVT::f32, { 2 } }, | |||
| 3531 | { ISD::FMAXNUM, MVT::v4f32, { 2 } }, | |||
| 3532 | { ISD::FMAXNUM, MVT::v8f32, { 2 } }, | |||
| 3533 | { ISD::FMAXNUM, MVT::v16f32, { 2 } }, | |||
| 3534 | { ISD::FMAXNUM, MVT::f64, { 2 } }, | |||
| 3535 | { ISD::FMAXNUM, MVT::v2f64, { 2 } }, | |||
| 3536 | { ISD::FMAXNUM, MVT::v4f64, { 2 } }, | |||
| 3537 | { ISD::FMAXNUM, MVT::v8f64, { 2 } }, | |||
| 3538 | { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 3539 | { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 3540 | { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 3541 | { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/ | |||
| 3542 | { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 3543 | { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 3544 | { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/ | |||
| 3545 | { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/ | |||
| 3546 | }; | |||
| 3547 | static const CostKindTblEntry XOPCostTbl[] = { | |||
| 3548 | { ISD::BITREVERSE, MVT::v4i64, { 4 } }, | |||
| 3549 | { ISD::BITREVERSE, MVT::v8i32, { 4 } }, | |||
| 3550 | { ISD::BITREVERSE, MVT::v16i16, { 4 } }, | |||
| 3551 | { ISD::BITREVERSE, MVT::v32i8, { 4 } }, | |||
| 3552 | { ISD::BITREVERSE, MVT::v2i64, { 1 } }, | |||
| 3553 | { ISD::BITREVERSE, MVT::v4i32, { 1 } }, | |||
| 3554 | { ISD::BITREVERSE, MVT::v8i16, { 1 } }, | |||
| 3555 | { ISD::BITREVERSE, MVT::v16i8, { 1 } }, | |||
| 3556 | { ISD::BITREVERSE, MVT::i64, { 3 } }, | |||
| 3557 | { ISD::BITREVERSE, MVT::i32, { 3 } }, | |||
| 3558 | { ISD::BITREVERSE, MVT::i16, { 3 } }, | |||
| 3559 | { ISD::BITREVERSE, MVT::i8, { 3 } }, | |||
| 3560 | // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y)) | |||
| 3561 | { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } }, | |||
| 3562 | { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } }, | |||
| 3563 | { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } }, | |||
| 3564 | { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } }, | |||
| 3565 | { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } }, | |||
| 3566 | { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } }, | |||
| 3567 | { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } }, | |||
| 3568 | { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } }, | |||
| 3569 | { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } }, | |||
| 3570 | { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } }, | |||
| 3571 | { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } }, | |||
| 3572 | { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } }, | |||
| 3573 | { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } }, | |||
| 3574 | { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } }, | |||
| 3575 | { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } }, | |||
| 3576 | { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } } | |||
| 3577 | }; | |||
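| | // Illustrative identity behind the VPROT comment above: for an N-bit | |||
| | // element, rotr(x, y) == rotl(x, (0 - y) & (N - 1)), which is why the | |||
| | // ROTR entries carry the extra cost of materialising SUB(0,Y) relative | |||
| | // to their ROTL counterparts. | |||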
| 3578 | static const CostKindTblEntry AVX2CostTbl[] = { | |||
| 3579 | { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) | |||
| 3580 | { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) | |||
| 3581 | { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
| 3582 | { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } }, | |||
| 3583 | { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } }, | |||
| 3584 | { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } }, | |||
| 3585 | { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } }, | |||
| 3586 | { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } }, | |||
| 3587 | { ISD::BITREVERSE, MVT::v2i64, { 3 } }, | |||
| 3588 | { ISD::BITREVERSE, MVT::v4i64, { 3 } }, | |||
| 3589 | { ISD::BITREVERSE, MVT::v4i32, { 3 } }, | |||
| 3590 | { ISD::BITREVERSE, MVT::v8i32, { 3 } }, | |||
| 3591 | { ISD::BITREVERSE, MVT::v8i16, { 3 } }, | |||
| 3592 | { ISD::BITREVERSE, MVT::v16i16, { 3 } }, | |||
| 3593 | { ISD::BITREVERSE, MVT::v16i8, { 3 } }, | |||
| 3594 | { ISD::BITREVERSE, MVT::v32i8, { 3 } }, | |||
| 3595 | { ISD::BSWAP, MVT::v4i64, { 1 } }, | |||
| 3596 | { ISD::BSWAP, MVT::v8i32, { 1 } }, | |||
| 3597 | { ISD::BSWAP, MVT::v16i16, { 1 } }, | |||
| 3598 | { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } }, | |||
| 3599 | { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } }, | |||
| 3600 | { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } }, | |||
| 3601 | { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } }, | |||
| 3602 | { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } }, | |||
| 3603 | { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } }, | |||
| 3604 | { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } }, | |||
| 3605 | { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } }, | |||
| 3606 | { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } }, | |||
| 3607 | { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } }, | |||
| 3608 | { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } }, | |||
| 3609 | { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } }, | |||
| 3610 | { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } }, | |||
| 3611 | { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } }, | |||
| 3612 | { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } }, | |||
| 3613 | { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } }, | |||
| 3614 | { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } }, | |||
| 3615 | { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } }, | |||
| 3616 | { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } }, | |||
| 3617 | { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } }, | |||
| 3618 | { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } }, | |||
| 3619 | { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } }, | |||
| 3620 | { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } }, | |||
| 3621 | { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } }, | |||
| 3622 | { ISD::SADDSAT, MVT::v16i16, { 1 } }, | |||
| 3623 | { ISD::SADDSAT, MVT::v32i8, { 1 } }, | |||
| 3624 | { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } }, | |||
| 3625 | { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } }, | |||
| 3626 | { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } }, | |||
| 3627 | { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } }, | |||
| 3628 | { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } }, | |||
| 3629 | { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } }, | |||
| 3630 | { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } }, | |||
| 3631 | { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } }, | |||
| 3632 | { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } }, | |||
| 3633 | { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } }, | |||
| 3634 | { ISD::SSUBSAT, MVT::v16i16, { 1 } }, | |||
| 3635 | { ISD::SSUBSAT, MVT::v32i8, { 1 } }, | |||
| 3636 | { ISD::UADDSAT, MVT::v16i16, { 1 } }, | |||
| 3637 | { ISD::UADDSAT, MVT::v32i8, { 1 } }, | |||
| 3638 | { ISD::UADDSAT, MVT::v8i32, { 3 } }, // not + pminud + paddd | |||
| 3639 | { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } }, | |||
| 3640 | { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } }, | |||
| 3641 | { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } }, | |||
| 3642 | { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } }, | |||
| 3643 | { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } }, | |||
| 3644 | { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } }, | |||
| 3645 | { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } }, | |||
| 3646 | { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } }, | |||
| 3647 | { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } }, | |||
| 3648 | { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } }, | |||
| 3649 | { ISD::USUBSAT, MVT::v16i16, { 1 } }, | |||
| 3650 | { ISD::USUBSAT, MVT::v32i8, { 1 } }, | |||
| 3651 | { ISD::USUBSAT, MVT::v8i32, { 2 } }, // pmaxud + psubd | |||
| 3652 | { ISD::FMAXNUM, MVT::v8f32, { 3 } }, // MAXPS + CMPUNORDPS + BLENDVPS | |||
| 3653 | { ISD::FMAXNUM, MVT::v4f64, { 3 } }, // MAXPD + CMPUNORDPD + BLENDVPD | |||
| 3654 | { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss | |||
| 3655 | { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps | |||
| 3656 | { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps | |||
| 3657 | { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd | |||
| 3658 | { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd | |||
| 3659 | { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd | |||
| 3660 | }; | |||
| 3661 | static const CostKindTblEntry AVX1CostTbl[] = { | |||
| 3662 | { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) | |||
| 3663 | { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } }, | |||
| 3664 | { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } }, | |||
| 3665 | { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } }, | |||
| 3666 | { ISD::BITREVERSE, MVT::v4i64, { 12 } }, // 2 x 128-bit Op + extract/insert | |||
| 3667 | { ISD::BITREVERSE, MVT::v8i32, { 12 } }, // 2 x 128-bit Op + extract/insert | |||
| 3668 | { ISD::BITREVERSE, MVT::v16i16, { 12 } }, // 2 x 128-bit Op + extract/insert | |||
| 3669 | { ISD::BITREVERSE, MVT::v32i8, { 12 } }, // 2 x 128-bit Op + extract/insert | |||
| 3670 | { ISD::BSWAP, MVT::v4i64, { 4 } }, | |||
| 3671 | { ISD::BSWAP, MVT::v8i32, { 4 } }, | |||
| 3672 | { ISD::BSWAP, MVT::v16i16, { 4 } }, | |||
| 3673 | { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert | |||
| 3674 | { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } }, | |||
| 3675 | { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert | |||
| 3676 | { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } }, | |||
| 3677 | { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert | |||
| 3678 | { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } }, | |||
| 3679 | { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert | |||
| 3680 | { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } }, | |||
| 3681 | { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert | |||
| 3682 | { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } }, | |||
| 3683 | { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert | |||
| 3684 | { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } }, | |||
| 3685 | { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert | |||
| 3686 | { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } }, | |||
| 3687 | { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert | |||
| 3688 | { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } }, | |||
| 3689 | { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert | |||
| 3690 | { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } }, | |||
| 3691 | { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert | |||
| 3692 | { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } }, | |||
| 3693 | { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert | |||
| 3694 | { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } }, | |||
| 3695 | { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert | |||
| 3696 | { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } }, | |||
| 3697 | { ISD::SADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert | |||
| 3698 | { ISD::SADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert | |||
| 3699 | { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert | |||
| 3700 | { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } }, | |||
| 3701 | { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert | |||
| 3702 | { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert | |||
| 3703 | { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert | |||
| 3704 | { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert | |||
| 3705 | { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } }, | |||
| 3706 | { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert | |||
| 3707 | { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert | |||
| 3708 | { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert | |||
| 3709 | { ISD::SSUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert | |||
| 3710 | { ISD::SSUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert | |||
| 3711 | { ISD::UADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert | |||
| 3712 | { ISD::UADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert | |||
| 3713 | { ISD::UADDSAT, MVT::v8i32, { 8 } }, // 2 x 128-bit Op + extract/insert | |||
| 3714 | { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert | |||
| 3715 | { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } }, | |||
| 3716 | { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert | |||
| 3717 | { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert | |||
| 3718 | { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert | |||
| 3719 | { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert | |||
| 3720 | { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } }, | |||
| 3721 | { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert | |||
| 3722 | { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert | |||
| 3723 | { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert | |||
| 3724 | { ISD::USUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert | |||
| 3725 | { ISD::USUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert | |||
| 3726 | { ISD::USUBSAT, MVT::v8i32, { 6 } }, // 2 x 128-bit Op + extract/insert | |||
| 3727 | { ISD::FMAXNUM, MVT::f32, { 3 } }, // MAXSS + CMPUNORDSS + BLENDVPS | |||
| 3728 | { ISD::FMAXNUM, MVT::v4f32, { 3 } }, // MAXPS + CMPUNORDPS + BLENDVPS | |||
| 3729 | { ISD::FMAXNUM, MVT::v8f32, { 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS + ? | |||
| 3730 | { ISD::FMAXNUM, MVT::f64, { 3 } }, // MAXSD + CMPUNORDSD + BLENDVPD | |||
| 3731 | { ISD::FMAXNUM, MVT::v2f64, { 3 } }, // MAXPD + CMPUNORDPD + BLENDVPD | |||
| 3732 | { ISD::FMAXNUM, MVT::v4f64, { 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD + ? | |||
| 3733 | { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss | |||
| 3734 | { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps | |||
| 3735 | { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps | |||
| 3736 | { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd | |||
| 3737 | { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd | |||
| 3738 | { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd | |||
| 3739 | }; | |||
| 3740 | static const CostKindTblEntry GLMCostTbl[] = { | |||
| 3741 | { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss | |||
| 3742 | { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps | |||
| 3743 | { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd | |||
| 3744 | { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd | |||
| 3745 | }; | |||
| 3746 | static const CostKindTblEntry SLMCostTbl[] = { | |||
| 3747 | { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss | |||
| 3748 | { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps | |||
| 3749 | { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd | |||
| 3750 | { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd | |||
| 3751 | }; | |||
| 3752 | static const CostKindTblEntry SSE42CostTbl[] = { | |||
| 3753 | { ISD::USUBSAT, MVT::v4i32, { 2 } }, // pmaxud + psubd | |||
| 3754 | { ISD::UADDSAT, MVT::v4i32, { 3 } }, // not + pminud + paddd | |||
| 3755 | { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 3756 | { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 3757 | }; | |||
| 3758 | static const CostKindTblEntry SSE41CostTbl[] = { | |||
| 3759 | { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X) | |||
| 3760 | { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } }, | |||
| 3761 | { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
| 3762 | { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } }, | |||
| 3763 | { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } }, | |||
| 3764 | { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
| 3765 | { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } }, | |||
| 3766 | { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } }, | |||
| 3767 | { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
| 3768 | { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } }, | |||
| 3769 | { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } }, | |||
| 3770 | { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } }, | |||
| 3771 | { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } }, | |||
| 3772 | }; | |||
| 3773 | static const CostKindTblEntry SSSE3CostTbl[] = { | |||
| 3774 | { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } }, | |||
| 3775 | { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } }, | |||
| 3776 | { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } }, | |||
| 3777 | { ISD::BITREVERSE, MVT::v2i64, { 5 } }, | |||
| 3778 | { ISD::BITREVERSE, MVT::v4i32, { 5 } }, | |||
| 3779 | { ISD::BITREVERSE, MVT::v8i16, { 5 } }, | |||
| 3780 | { ISD::BITREVERSE, MVT::v16i8, { 5 } }, | |||
| 3781 | { ISD::BSWAP, MVT::v2i64, { 1 } }, | |||
| 3782 | { ISD::BSWAP, MVT::v4i32, { 1 } }, | |||
| 3783 | { ISD::BSWAP, MVT::v8i16, { 1 } }, | |||
| 3784 | { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } }, | |||
| 3785 | { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } }, | |||
| 3786 | { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } }, | |||
| 3787 | { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } }, | |||
| 3788 | { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } }, | |||
| 3789 | { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } }, | |||
| 3790 | { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } }, | |||
| 3791 | { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } }, | |||
| 3792 | { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } }, | |||
| 3793 | { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } }, | |||
| 3794 | { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } }, | |||
| 3795 | { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } } | |||
| 3796 | }; | |||
| 3797 | static const CostKindTblEntry SSE2CostTbl[] = { | |||
| 3798 | { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } }, | |||
| 3799 | { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } }, | |||
| 3800 | { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } }, | |||
| 3801 | { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } }, | |||
| 3802 | { ISD::BITREVERSE, MVT::v2i64, { 29 } }, | |||
| 3803 | { ISD::BITREVERSE, MVT::v4i32, { 27 } }, | |||
| 3804 | { ISD::BITREVERSE, MVT::v8i16, { 27 } }, | |||
| 3805 | { ISD::BITREVERSE, MVT::v16i8, { 20 } }, | |||
| 3806 | { ISD::BSWAP, MVT::v2i64, { 7 } }, | |||
| 3807 | { ISD::BSWAP, MVT::v4i32, { 7 } }, | |||
| 3808 | { ISD::BSWAP, MVT::v8i16, { 7 } }, | |||
| 3809 | { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } }, | |||
| 3810 | { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } }, | |||
| 3811 | { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } }, | |||
| 3812 | { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } }, | |||
| 3813 | { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } }, | |||
| 3814 | { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } }, | |||
| 3815 | { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } }, | |||
| 3816 | { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } }, | |||
| 3817 | { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } }, | |||
| 3818 | { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } }, | |||
| 3819 | { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } }, | |||
| 3820 | { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } }, | |||
| 3821 | { ISD::SADDSAT, MVT::v8i16, { 1 } }, | |||
| 3822 | { ISD::SADDSAT, MVT::v16i8, { 1 } }, | |||
| 3823 | { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } }, | |||
| 3824 | { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } }, | |||
| 3825 | { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } }, | |||
| 3826 | { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } }, | |||
| 3827 | { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } }, | |||
| 3828 | { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } }, | |||
| 3829 | { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } }, | |||
| 3830 | { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } }, | |||
| 3831 | { ISD::SSUBSAT, MVT::v8i16, { 1 } }, | |||
| 3832 | { ISD::SSUBSAT, MVT::v16i8, { 1 } }, | |||
| 3833 | { ISD::UADDSAT, MVT::v8i16, { 1 } }, | |||
| 3834 | { ISD::UADDSAT, MVT::v16i8, { 1 } }, | |||
| 3835 | { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } }, | |||
| 3836 | { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } }, | |||
| 3837 | { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } }, | |||
| 3838 | { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } }, | |||
| 3839 | { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } }, | |||
| 3840 | { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } }, | |||
| 3841 | { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } }, | |||
| 3842 | { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } }, | |||
| 3843 | { ISD::USUBSAT, MVT::v8i16, { 1 } }, | |||
| 3844 | { ISD::USUBSAT, MVT::v16i8, { 1 } }, | |||
| 3845 | { ISD::FMAXNUM, MVT::f64, { 4 } }, | |||
| 3846 | { ISD::FMAXNUM, MVT::v2f64, { 4 } }, | |||
| 3847 | { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 3848 | { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/ | |||
| 3849 | }; | |||
| 3850 | static const CostKindTblEntry SSE1CostTbl[] = { | |||
| 3851 | { ISD::FMAXNUM, MVT::f32, { 4 } }, | |||
| 3852 | { ISD::FMAXNUM, MVT::v4f32, { 4 } }, | |||
| 3853 | { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/ | |||
| 3854 | { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/ | |||
| 3855 | }; | |||
| 3856 | static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets | |||
| 3857 | { ISD::CTTZ, MVT::i64, { 1 } }, | |||
| 3858 | }; | |||
| 3859 | static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets | |||
| 3860 | { ISD::CTTZ, MVT::i32, { 1 } }, | |||
| 3861 | { ISD::CTTZ, MVT::i16, { 1 } }, | |||
| 3862 | { ISD::CTTZ, MVT::i8, { 1 } }, | |||
| 3863 | }; | |||
| 3864 | static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets | |||
| 3865 | { ISD::CTLZ, MVT::i64, { 1 } }, | |||
| 3866 | }; | |||
| 3867 | static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets | |||
| 3868 | { ISD::CTLZ, MVT::i32, { 1 } }, | |||
| 3869 | { ISD::CTLZ, MVT::i16, { 2 } }, | |||
| 3870 | { ISD::CTLZ, MVT::i8, { 2 } }, | |||
| 3871 | }; | |||
| 3872 | static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets | |||
| 3873 | { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt | |||
| 3874 | }; | |||
| 3875 | static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets | |||
| 3876 | { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt | |||
| 3877 | { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext()) | |||
| 3878 | { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext()) | |||
| 3879 | }; | |||
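| | // Illustrative lowering behind the popcnt(zext()) comments above | |||
| | // (assuming a 64-bit target): an i8 ctpop becomes two instructions, | |||
| | //   movzbl %dil, %eax | |||
| | //   popcntl %eax, %eax | |||
| | // matching the CodeSize/SizeAndLatency entries of 2 for i8/i16. | |||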
| 3880 | static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets | |||
| 3881 | { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV | |||
| 3882 | { ISD::BITREVERSE, MVT::i64, { 14 } }, | |||
| 3883 | { ISD::BSWAP, MVT::i64, { 1 } }, | |||
| 3884 | { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV | |||
| 3885 | { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR+XOR | |||
| 3886 | { ISD::CTTZ, MVT::i64, { 3 } }, // TEST+BSF+CMOV/BRANCH | |||
| 3887 | { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSF | |||
| 3888 | { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } }, | |||
| 3889 | { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } }, | |||
| 3890 | { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } }, | |||
| 3891 | { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } }, | |||
| 3892 | { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } }, | |||
| 3893 | { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } }, | |||
| 3894 | { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } }, | |||
| 3895 | { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } }, | |||
| 3896 | { ISD::SADDO, MVT::i64, { 1 } }, | |||
| 3897 | { ISD::UADDO, MVT::i64, { 1 } }, | |||
| 3898 | { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto | |||
| 3899 | }; | |||
| 3900 | static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets | |||
| 3901 | { ISD::ABS, MVT::i32, { 1, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV | |||
| 3902 | { ISD::ABS, MVT::i16, { 2, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV | |||
| 3903 | { ISD::ABS, MVT::i8, { 2, 4, 4, 4 } }, // SUB+XOR+SRA | |||
| 3904 | { ISD::BITREVERSE, MVT::i32, { 14 } }, | |||
| 3905 | { ISD::BITREVERSE, MVT::i16, { 14 } }, | |||
| 3906 | { ISD::BITREVERSE, MVT::i8, { 11 } }, | |||
| 3907 | { ISD::BSWAP, MVT::i32, { 1 } }, | |||
| 3908 | { ISD::BSWAP, MVT::i16, { 1 } }, // ROL | |||
| 3909 | { ISD::CTLZ, MVT::i32, { 4 } }, // BSR+XOR or BSR+XOR+CMOV | |||
| 3910 | { ISD::CTLZ, MVT::i16, { 4 } }, // BSR+XOR or BSR+XOR+CMOV | |||
| 3911 | { ISD::CTLZ, MVT::i8, { 4 } }, // BSR+XOR or BSR+XOR+CMOV | |||
| 3912 | { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSR+XOR | |||
| 3913 | { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 3, 3 } }, // BSR+XOR | |||
| 3914 | { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR | |||
| 3915 | { ISD::CTTZ, MVT::i32, { 3 } }, // TEST+BSF+CMOV/BRANCH | |||
| 3916 | { ISD::CTTZ, MVT::i16, { 3 } }, // TEST+BSF+CMOV/BRANCH | |||
| 3917 | { ISD::CTTZ, MVT::i8, { 3 } }, // TEST+BSF+CMOV/BRANCH | |||
| 3918 | { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSF | |||
| 3919 | { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 1 } }, // BSF | |||
| 3920 | { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 1 } }, // BSF | |||
| 3921 | { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } }, | |||
| 3922 | { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } }, | |||
| 3923 | { ISD::CTPOP, MVT::i8, { 7, 6, 13, 13 } }, | |||
| 3924 | { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } }, | |||
| 3925 | { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } }, | |||
| 3926 | { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } }, | |||
| 3927 | { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } }, | |||
| 3928 | { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } }, | |||
| 3929 | { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } }, | |||
| 3930 | { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } }, | |||
| 3931 | { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } }, | |||
| 3932 | { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } }, | |||
| 3933 | { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } }, | |||
| 3934 | { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } }, | |||
| 3935 | { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } }, | |||
| 3936 | { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } }, | |||
| 3937 | { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } }, | |||
| 3938 | { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } }, | |||
| 3939 | { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } }, | |||
| 3940 | { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } }, | |||
| 3941 | { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } }, | |||
| 3942 | { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } }, | |||
| 3943 | { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } }, | |||
| 3944 | { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } }, | |||
| 3945 | { ISD::SADDO, MVT::i32, { 1 } }, | |||
| 3946 | { ISD::SADDO, MVT::i16, { 1 } }, | |||
| 3947 | { ISD::SADDO, MVT::i8, { 1 } }, | |||
| 3948 | { ISD::UADDO, MVT::i32, { 1 } }, | |||
| 3949 | { ISD::UADDO, MVT::i16, { 1 } }, | |||
| 3950 | { ISD::UADDO, MVT::i8, { 1 } }, | |||
| 3951 | { ISD::UMULO, MVT::i32, { 2 } }, // mul + seto | |||
| 3952 | { ISD::UMULO, MVT::i16, { 2 } }, | |||
| 3953 | { ISD::UMULO, MVT::i8, { 2 } }, | |||
| 3954 | }; | |||
| 3955 | ||||
| 3956 | Type *RetTy = ICA.getReturnType(); | |||
| 3957 | Type *OpTy = RetTy; | |||
| 3958 | Intrinsic::ID IID = ICA.getID(); | |||
| 3959 | unsigned ISD = ISD::DELETED_NODE; | |||
| 3960 | switch (IID) { | |||
| 3961 | default: | |||
| 3962 | break; | |||
| 3963 | case Intrinsic::abs: | |||
| 3964 | ISD = ISD::ABS; | |||
| 3965 | break; | |||
| 3966 | case Intrinsic::bitreverse: | |||
| 3967 | ISD = ISD::BITREVERSE; | |||
| 3968 | break; | |||
| 3969 | case Intrinsic::bswap: | |||
| 3970 | ISD = ISD::BSWAP; | |||
| 3971 | break; | |||
| 3972 | case Intrinsic::ctlz: | |||
| 3973 | ISD = ISD::CTLZ; | |||
| 3974 | break; | |||
| 3975 | case Intrinsic::ctpop: | |||
| 3976 | ISD = ISD::CTPOP; | |||
| 3977 | break; | |||
| 3978 | case Intrinsic::cttz: | |||
| 3979 | ISD = ISD::CTTZ; | |||
| 3980 | break; | |||
| 3981 | case Intrinsic::fshl: | |||
| 3982 | ISD = ISD::FSHL; | |||
| 3983 | if (!ICA.isTypeBasedOnly()) { | |||
| 3984 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); | |||
| 3985 | if (Args[0] == Args[1]) | |||
| 3986 | ISD = ISD::ROTL; | |||
| 3987 | } | |||
| 3988 | break; | |||
| 3989 | case Intrinsic::fshr: | |||
| 3990 | // FSHR has same costs so don't duplicate. | |||
| 3991 | ISD = ISD::FSHL; | |||
| 3992 | if (!ICA.isTypeBasedOnly()) { | |||
| 3993 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); | |||
| 3994 | if (Args[0] == Args[1]) | |||
| 3995 | ISD = ISD::ROTR; | |||
| 3996 | } | |||
| 3997 | break; | |||
| 3998 | case Intrinsic::maxnum: | |||
| 3999 | case Intrinsic::minnum: | |||
| 4000 | // FMINNUM has same costs so don't duplicate. | |||
| 4001 | ISD = ISD::FMAXNUM; | |||
| 4002 | break; | |||
| 4003 | case Intrinsic::sadd_sat: | |||
| 4004 | ISD = ISD::SADDSAT; | |||
| 4005 | break; | |||
| 4006 | case Intrinsic::smax: | |||
| 4007 | ISD = ISD::SMAX; | |||
| 4008 | break; | |||
| 4009 | case Intrinsic::smin: | |||
| 4010 | ISD = ISD::SMIN; | |||
| 4011 | break; | |||
| 4012 | case Intrinsic::ssub_sat: | |||
| 4013 | ISD = ISD::SSUBSAT; | |||
| 4014 | break; | |||
| 4015 | case Intrinsic::uadd_sat: | |||
| 4016 | ISD = ISD::UADDSAT; | |||
| 4017 | break; | |||
| 4018 | case Intrinsic::umax: | |||
| 4019 | ISD = ISD::UMAX; | |||
| 4020 | break; | |||
| 4021 | case Intrinsic::umin: | |||
| 4022 | ISD = ISD::UMIN; | |||
| 4023 | break; | |||
| 4024 | case Intrinsic::usub_sat: | |||
| 4025 | ISD = ISD::USUBSAT; | |||
| 4026 | break; | |||
| 4027 | case Intrinsic::sqrt: | |||
| 4028 | ISD = ISD::FSQRT; | |||
| 4029 | break; | |||
| 4030 | case Intrinsic::sadd_with_overflow: | |||
| 4031 | case Intrinsic::ssub_with_overflow: | |||
| 4032 | // SSUBO has same costs so don't duplicate. | |||
| 4033 | ISD = ISD::SADDO; | |||
| 4034 | OpTy = RetTy->getContainedType(0); | |||
| 4035 | break; | |||
| 4036 | case Intrinsic::uadd_with_overflow: | |||
| 4037 | case Intrinsic::usub_with_overflow: | |||
| 4038 | // USUBO has same costs so don't duplicate. | |||
| 4039 | ISD = ISD::UADDO; | |||
| 4040 | OpTy = RetTy->getContainedType(0); | |||
| 4041 | break; | |||
| 4042 | case Intrinsic::umul_with_overflow: | |||
| 4043 | case Intrinsic::smul_with_overflow: | |||
| 4044 | // SMULO has same costs so don't duplicate. | |||
| 4045 | ISD = ISD::UMULO; | |||
| 4046 | OpTy = RetTy->getContainedType(0); | |||
| 4047 | break; | |||
| 4048 | } | |||
| 4049 | ||||
| 4050 | if (ISD != ISD::DELETED_NODE) { | |||
| 4051 | // Legalize the type. | |||
| 4052 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(OpTy); | |||
| 4053 | MVT MTy = LT.second; | |||
| 4054 | ||||
| 4055 | // Attempt to lookup cost. | |||
| 4056 | if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() && | |||
| 4057 | MTy.isVector()) { | |||
| 4058 | // With PSHUFB the code is very similar for all types. If we have integer | |||
| 4059 | // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types | |||
| 4060 | // we also need a PSHUFB. | |||
| 4061 | unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2; | |||
| 4062 | ||||
| 4063 | // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB | |||
| 4064 | // instructions. We also need an extract and an insert. | |||
| 4065 | if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) || | |||
| 4066 | (ST->hasBWI() && MTy.is512BitVector()))) | |||
| 4067 | Cost = Cost * 2 + 2; | |||
| 4068 | ||||
| 4069 | return LT.first * Cost; | |||
| 4070 | } | |||
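As a worked aside, the GFNI bitreverse rule in this block reduces to a small closed form. A minimal standalone sketch, using plain booleans in place of the MVT/subtarget queries (the function and parameter names are ad hoc, not LLVM API):

```cpp
#include <cstdio>

// IsByteVec: element type is i8. HasNativeWidth: 128-bit, or 256-bit with
// AVX2, or 512-bit with AVX512BW.
unsigned gfniBitreverseCost(bool IsByteVec, bool HasNativeWidth) {
  unsigned Cost = IsByteVec ? 1 : 2; // GF2P8AFFINEQB, plus PSHUFB for non-i8
  if (!HasNativeWidth)               // split halves + extract + insert
    Cost = Cost * 2 + 2;
  return Cost;
}

int main() {
  std::printf("v16i8 native : %u\n", gfniBitreverseCost(true, true));   // 1
  std::printf("v8i32 native : %u\n", gfniBitreverseCost(false, true));  // 2
  std::printf("v32i8 no AVX2: %u\n", gfniBitreverseCost(true, false));  // 4
}
```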
| 4071 | ||||
| 4072 | // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost. | |||
| 4073 | if (((ISD == ISD::CTTZ && !ST->hasBMI()) || | |||
| 4074 | (ISD == ISD::CTLZ && !ST->hasLZCNT())) && | |||
| 4075 | !MTy.isVector() && !ICA.isTypeBasedOnly()) { | |||
| 4076 | const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); | |||
| 4077 | if (auto *Cst = dyn_cast<ConstantInt>(Args[1])) | |||
| 4078 | if (Cst->isAllOnesValue()) | |||
| 4079 | ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF; | |||
| 4080 | } | |||
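A minimal sketch of that relaxation, with an ad hoc enum standing in for the real ISD opcodes: when the intrinsic's "zero is poison" operand is a constant all-ones value, the cheaper *_ZERO_UNDEF table entry may be priced instead.

```cpp
// Ad hoc opcodes for illustration only; not the ISD namespace.
enum class Op { CTTZ, CTLZ, CTTZ_ZERO_UNDEF, CTLZ_ZERO_UNDEF };

Op relaxForZeroPoison(Op ISD, bool ZeroIsPoison) {
  if (ZeroIsPoison && ISD == Op::CTTZ) return Op::CTTZ_ZERO_UNDEF;
  if (ZeroIsPoison && ISD == Op::CTLZ) return Op::CTLZ_ZERO_UNDEF;
  return ISD; // otherwise keep the zero-defined (more expensive) form
}
```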
| 4081 | ||||
| 4082 | // FSQRT is a single instruction. | |||
| 4083 | if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize) | |||
| 4084 | return LT.first; | |||
| 4085 | ||||
| 4086 | auto adjustTableCost = [](int ISD, unsigned Cost, | |||
| 4087 | InstructionCost LegalizationCost, | |||
| 4088 | FastMathFlags FMF) { | |||
| 4089 | // If there are no NANs to deal with, then these are reduced to a | |||
| 4090 | // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we | |||
| 4091 | // assume is used in the non-fast case. | |||
| 4092 | if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) { | |||
| 4093 | if (FMF.noNaNs()) | |||
| 4094 | return LegalizationCost * 1; | |||
| 4095 | } | |||
| 4096 | return LegalizationCost * (int)Cost; | |||
| 4097 | }; | |||
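A minimal sketch of what the lambda computes, with assumed plain types: under no-NaNs fast-math a float min/max is a single MINSS/MAXSS-style op, so only the legalization factor survives; otherwise the tabled cost for the CMP+MIN/MAX+SELECT sequence is scaled by it.

```cpp
long adjustSketch(bool IsFMinMax, bool NoNaNs, unsigned TableCost,
                  long LegalizationCost) {
  if (IsFMinMax && NoNaNs)
    return LegalizationCost * 1; // one instruction per legalized vector
  return LegalizationCost * static_cast<long>(TableCost);
}
```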
| 4098 | ||||
| 4099 | if (ST->useGLMDivSqrtCosts()) | |||
| 4100 | if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy)) | |||
| 4101 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4102 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4103 | ICA.getFlags()); | |||
| 4104 | ||||
| 4105 | if (ST->useSLMArithCosts()) | |||
| 4106 | if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) | |||
| 4107 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4108 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4109 | ICA.getFlags()); | |||
| 4110 | ||||
| 4111 | if (ST->hasVBMI2()) | |||
| 4112 | if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy)) | |||
| 4113 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4114 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4115 | ICA.getFlags()); | |||
| 4116 | ||||
| 4117 | if (ST->hasBITALG()) | |||
| 4118 | if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy)) | |||
| 4119 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4120 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4121 | ICA.getFlags()); | |||
| 4122 | ||||
| 4123 | if (ST->hasVPOPCNTDQ()) | |||
| 4124 | if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy)) | |||
| 4125 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4126 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4127 | ICA.getFlags()); | |||
| 4128 | ||||
| 4129 | if (ST->hasCDI()) | |||
| 4130 | if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) | |||
| 4131 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4132 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4133 | ICA.getFlags()); | |||
| 4134 | ||||
| 4135 | if (ST->hasBWI()) | |||
| 4136 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | |||
| 4137 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4138 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4139 | ICA.getFlags()); | |||
| 4140 | ||||
| 4141 | if (ST->hasAVX512()) | |||
| 4142 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | |||
| 4143 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4144 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4145 | ICA.getFlags()); | |||
| 4146 | ||||
| 4147 | if (ST->hasXOP()) | |||
| 4148 | if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) | |||
| 4149 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4150 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4151 | ICA.getFlags()); | |||
| 4152 | ||||
| 4153 | if (ST->hasAVX2()) | |||
| 4154 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | |||
| 4155 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4156 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4157 | ICA.getFlags()); | |||
| 4158 | ||||
| 4159 | if (ST->hasAVX()) | |||
| 4160 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | |||
| 4161 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4162 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4163 | ICA.getFlags()); | |||
| 4164 | ||||
| 4165 | if (ST->hasSSE42()) | |||
| 4166 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | |||
| 4167 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4168 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4169 | ICA.getFlags()); | |||
| 4170 | ||||
| 4171 | if (ST->hasSSE41()) | |||
| 4172 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) | |||
| 4173 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4174 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4175 | ICA.getFlags()); | |||
| 4176 | ||||
| 4177 | if (ST->hasSSSE3()) | |||
| 4178 | if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy)) | |||
| 4179 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4180 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4181 | ICA.getFlags()); | |||
| 4182 | ||||
| 4183 | if (ST->hasSSE2()) | |||
| 4184 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | |||
| 4185 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4186 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4187 | ICA.getFlags()); | |||
| 4188 | ||||
| 4189 | if (ST->hasSSE1()) | |||
| 4190 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | |||
| 4191 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4192 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4193 | ICA.getFlags()); | |||
| 4194 | ||||
| 4195 | if (ST->hasBMI()) { | |||
| 4196 | if (ST->is64Bit()) | |||
| 4197 | if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy)) | |||
| 4198 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4199 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4200 | ICA.getFlags()); | |||
| 4201 | ||||
| 4202 | if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy)) | |||
| 4203 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4204 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4205 | ICA.getFlags()); | |||
| 4206 | } | |||
| 4207 | ||||
| 4208 | if (ST->hasLZCNT()) { | |||
| 4209 | if (ST->is64Bit()) | |||
| 4210 | if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy)) | |||
| 4211 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4212 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4213 | ICA.getFlags()); | |||
| 4214 | ||||
| 4215 | if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy)) | |||
| 4216 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4217 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4218 | ICA.getFlags()); | |||
| 4219 | } | |||
| 4220 | ||||
| 4221 | if (ST->hasPOPCNT()) { | |||
| 4222 | if (ST->is64Bit()) | |||
| 4223 | if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy)) | |||
| 4224 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4225 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4226 | ICA.getFlags()); | |||
| 4227 | ||||
| 4228 | if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy)) | |||
| 4229 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4230 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4231 | ICA.getFlags()); | |||
| 4232 | } | |||
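All of the table probes above share one shape: consult tables from the most specific ISA extension down to plain x86 and return on the first entry that defines a cost for the requested cost kind. A self-contained sketch of that shape with ad hoc types (not the real CostTblEntry machinery):

```cpp
#include <cstdio>
#include <optional>
#include <vector>

struct Entry { int ISD; int MTy; std::optional<unsigned> Cost; };
using Table = std::vector<Entry>;

std::optional<unsigned> firstHit(const std::vector<const Table *> &Tables,
                                 int ISD, int MTy) {
  for (const Table *T : Tables)        // front() = most specific feature set
    for (const Entry &E : *T)
      if (E.ISD == ISD && E.MTy == MTy && E.Cost)
        return E.Cost;
  return std::nullopt;                 // caller falls back to BaseT
}

int main() {
  Table AVX512 = {{1, 7, 3u}};         // ad hoc ISD/MVT encodings
  Table SSE2 = {{1, 7, 8u}};
  std::printf("%u\n", *firstHit({&AVX512, &SSE2}, 1, 7)); // 3: AVX512 wins
}
```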
| 4233 | ||||
| 4234 | if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { | |||
| 4235 | if (const Instruction *II = ICA.getInst()) { | |||
| 4236 | if (II->hasOneUse() && isa<StoreInst>(II->user_back())) | |||
| 4237 | return TTI::TCC_Free; | |||
| 4238 | if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { | |||
| 4239 | if (LI->hasOneUse()) | |||
| 4240 | return TTI::TCC_Free; | |||
| 4241 | } | |||
| 4242 | } | |||
| 4243 | } | |||
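The MOVBE special case above treats a single-use load->bswap or bswap->store as free because the swap folds into the memory access itself. A C++-level illustration of the two shapes (a codegen expectation on fast-MOVBE targets, not a guarantee; __builtin_bswap32 is a GCC/Clang builtin):

```cpp
#include <cstdint>

uint32_t load_be(const uint32_t *P) {
  return __builtin_bswap32(*P); // single-use load feeding bswap -> MOVBE load
}
void store_be(uint32_t *P, uint32_t V) {
  *P = __builtin_bswap32(V);    // bswap whose only user is a store -> MOVBE store
}
```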
| 4244 | ||||
| 4245 | if (ST->is64Bit()) | |||
| 4246 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy)) | |||
| 4247 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4248 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, | |||
| 4249 | ICA.getFlags()); | |||
| 4250 | ||||
| 4251 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy)) | |||
| 4252 | if (auto KindCost = Entry->Cost[CostKind]) | |||
| 4253 | return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags()); | |||
| 4254 | } | |||
| 4255 | ||||
| 4256 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); | |||
| 4257 | } | |||
| 4258 | ||||
| 4259 | InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, | |||
| 4260 | unsigned Index) { | |||
| 4261 | static const CostTblEntry SLMCostTbl[] = { | |||
| 4262 | { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 }, | |||
| 4263 | { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 }, | |||
| 4264 | { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 }, | |||
| 4265 | { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 } | |||
| 4266 | }; | |||
| 4267 | ||||
| 4268 | assert(Val->isVectorTy() && "This must be a vector type"); | |||
| 4269 | Type *ScalarType = Val->getScalarType(); | |||
| 4270 | InstructionCost RegisterFileMoveCost = 0; | |||
| 4271 | TTI::TargetCostKind CostKind = TTI::TargetCostKind::TCK_RecipThroughput; | |||
| 4272 | ||||
| 4273 | // Non-immediate extraction/insertion can be handled as a sequence of | |||
| 4274 | // aliased loads+stores via the stack. | |||
| 4275 | if (Index == -1U && (Opcode == Instruction::ExtractElement || | |||
| 4276 | Opcode == Instruction::InsertElement)) { | |||
| 4277 | // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns: | |||
| 4278 | // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0. | |||
| 4279 | ||||
| 4280 | // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling. | |||
| 4281 | assert(isa<FixedVectorType>(Val) && "Fixed vector type expected"); | |||
| 4282 | Align VecAlign = DL.getPrefTypeAlign(Val); | |||
| 4283 | Align SclAlign = DL.getPrefTypeAlign(ScalarType); | |||
| 4284 | ||||
| 4285 | // Extract - store vector to stack, load scalar. | |||
| 4286 | if (Opcode == Instruction::ExtractElement) { | |||
| 4287 | return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) + | |||
| 4288 | getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0, | |||
| 4289 | CostKind); | |||
| 4290 | } | |||
| 4291 | // Insert - store vector to stack, store scalar, load vector. | |||
| 4292 | if (Opcode == Instruction::InsertElement) { | |||
| 4293 | return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) + | |||
| 4294 | getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0, | |||
| 4295 | CostKind) + | |||
| 4296 | getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind); | |||
| 4297 | } | |||
| 4298 | } | |||
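A rough C++ analogue of the stack round-trip being costed above, for the extract case (an illustrative sketch only; the real lowering operates on vector registers, not arrays, and the insert case reloads the whole vector afterwards):

```cpp
#include <cstring>

float extractVarIdx(const float (&Vec)[4], unsigned Idx) {
  float Tmp[4];
  std::memcpy(Tmp, Vec, sizeof(Tmp)); // store the vector to a stack slot
  return Tmp[Idx & 3];                // load the selected scalar back
}
```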
| 4299 | ||||
| 4300 | if (Index != -1U && (Opcode == Instruction::ExtractElement || | |||
| 4301 | Opcode == Instruction::InsertElement)) { | |||
| 4302 | // Extraction of vXi1 elements is now efficiently handled by MOVMSK. | |||
| 4303 | if (Opcode == Instruction::ExtractElement && | |||
| 4304 | ScalarType->getScalarSizeInBits() == 1 && | |||
| 4305 | cast<FixedVectorType>(Val)->getNumElements() > 1) | |||
| 4306 | return 1; | |||
| 4307 | ||||
| 4308 | // Legalize the type. | |||
| 4309 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val); | |||
| 4310 | ||||
| 4311 | // This type is legalized to a scalar type. | |||
| 4312 | if (!LT.second.isVector()) | |||
| 4313 | return 0; | |||
| 4314 | ||||
| 4315 | // The type may be split. Normalize the index to the new type. | |||
| 4316 | unsigned SizeInBits = LT.second.getSizeInBits(); | |||
| 4317 | unsigned NumElts = LT.second.getVectorNumElements(); | |||
| 4318 | unsigned SubNumElts = NumElts; | |||
| 4319 | Index = Index % NumElts; | |||
| 4320 | ||||
| 4321 | // For >128-bit vectors, we need to extract higher 128-bit subvectors. | |||
| 4322 | // For inserts, we also need to insert the subvector back. | |||
| 4323 | if (SizeInBits > 128) { | |||
| 4324 | assert((SizeInBits % 128) == 0 && "Illegal vector")(static_cast <bool> ((SizeInBits % 128) == 0 && "Illegal vector") ? void (0) : __assert_fail ("(SizeInBits % 128) == 0 && \"Illegal vector\"" , "llvm/lib/Target/X86/X86TargetTransformInfo.cpp", 4324, __extension__ __PRETTY_FUNCTION__)); | |||
| 4325 | unsigned NumSubVecs = SizeInBits / 128; | |||
| 4326 | SubNumElts = NumElts / NumSubVecs; | |||
| 4327 | if (SubNumElts <= Index) { | |||
| 4328 | RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1); | |||
| 4329 | Index %= SubNumElts; | |||
| 4330 | } | |||
| 4331 | } | |||
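A worked example of the normalization above, in plain integers: inserting at index 5 of a v8i32 on a 256-bit target. The modulo on the flagged line divides by SubNumElts, so this sketch also asserts the divisor is non-zero before taking it (a guard, under the assumption that a degenerate split is the hazard being reported):

```cpp
#include <cassert>
#include <cstdio>

int main() {
  unsigned SizeInBits = 256, NumElts = 8, Index = 5, SubNumElts = NumElts;
  unsigned RegisterFileMoveCost = 0;
  if (SizeInBits > 128) {
    unsigned NumSubVecs = SizeInBits / 128; // 2 subvectors of 128 bits
    SubNumElts = NumElts / NumSubVecs;      // 4 elements per subvector
    assert(SubNumElts != 0 && "degenerate split: modulo would divide by zero");
    if (SubNumElts <= Index) {
      RegisterFileMoveCost += 2; // insert: extract the lane + reinsert it
      Index %= SubNumElts;       // 5 -> lane-local index 1
    }
  }
  std::printf("lane-local index %u, move cost %u\n", Index,
              RegisterFileMoveCost); // prints: lane-local index 1, move cost 2
}
```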
| 4332 | ||||
| 4333 | if (Index == 0) { | |||
| 4334 | // Floating point scalars are already located in index #0. | |||
| 4335 | // Many insertions to #0 can fold away for scalar fp-ops, so let's assume | |||
| 4336 | // true for all. | |||
| 4337 | if (ScalarType->isFloatingPointTy()) | |||
| 4338 | return RegisterFileMoveCost; | |||
| 4339 | ||||
| 4340 | // Assume movd/movq XMM -> GPR is relatively cheap on all targets. | |||
| 4341 | if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement) | |||
| 4342 | return 1 + RegisterFileMoveCost; | |||
| 4343 | } | |||
| 4344 | ||||
| 4345 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
| 4346 | assert(ISD && "Unexpected vector opcode"); | |||
| 4347 | MVT MScalarTy = LT.second.getScalarType(); | |||
| 4348 | if (ST->useSLMArithCosts()) | |||
| 4349 | if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy)) | |||
| 4350 | return Entry->Cost + RegisterFileMoveCost; | |||
| 4351 | ||||
| 4352 | // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets. | |||
| 4353 | if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || | |||
| 4354 | (MScalarTy.isInteger() && ST->hasSSE41())) | |||
| 4355 | return 1 + RegisterFileMoveCost; | |||
| 4356 | ||||
| 4357 | // Assume insertps is relatively cheap on all targets. | |||
| 4358 | if (MScalarTy == MVT::f32 && ST->hasSSE41() && | |||
| 4359 | Opcode == Instruction::InsertElement) | |||
| 4360 | return 1 + RegisterFileMoveCost; | |||
| 4361 | ||||
| 4362 | // For extractions we just need to shuffle the element to index 0, which | |||
| 4363 | // should be very cheap (assume cost = 1). For insertions we need to shuffle | |||
| 4364 | // the elements to their destination. In both cases we must handle the | |||
| 4365 | // subvector move(s). | |||
| 4366 | // If the vector type is already less than 128-bits then don't reduce it. | |||
| 4367 | // TODO: Under what circumstances should we shuffle using the full width? | |||
| 4368 | InstructionCost ShuffleCost = 1; | |||
| 4369 | if (Opcode == Instruction::InsertElement) { | |||
| 4370 | auto *SubTy = cast<VectorType>(Val); | |||
| 4371 | EVT VT = TLI->getValueType(DL, Val); | |||
| 4372 | if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128) | |||
| 4373 | SubTy = FixedVectorType::get(ScalarType, SubNumElts); | |||
| 4374 | ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, std::nullopt, | |||
| 4375 | CostKind, 0, SubTy); | |||
| 4376 | } | |||
| 4377 | int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1; | |||
| 4378 | return ShuffleCost + IntOrFpCost + RegisterFileMoveCost; | |||
| 4379 | } | |||
| 4380 | ||||
| 4381 | // Add to the base cost if we know that the extracted element of a vector is | |||
| 4382 | // destined to be moved to and used in the integer register file. | |||
| 4383 | if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy()) | |||
| 4384 | RegisterFileMoveCost += 1; | |||
| 4385 | ||||
| 4386 | return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; | |||
| 4387 | } | |||
| 4388 | ||||
| 4389 | InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty, | |||
| 4390 | const APInt &DemandedElts, | |||
| 4391 | bool Insert, | |||
| 4392 | bool Extract) { | |||
| 4393 | assert(DemandedElts.getBitWidth() == | |||
| 4394 | cast<FixedVectorType>(Ty)->getNumElements() && | |||
| 4395 | "Vector size mismatch"); | |||
| 4396 | ||||
| 4397 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); | |||
| 4398 | MVT MScalarTy = LT.second.getScalarType(); | |||
| 4399 | unsigned LegalVectorBitWidth = LT.second.getSizeInBits(); | |||
| 4400 | TTI::TargetCostKind CostKind = TTI::TargetCostKind::TCK_RecipThroughput; | |||
| 4401 | InstructionCost Cost = 0; | |||
| 4402 | ||||
| 4403 | constexpr unsigned LaneBitWidth = 128; | |||
| 4404 | assert((LegalVectorBitWidth < LaneBitWidth || | |||
| 4405 | (LegalVectorBitWidth % LaneBitWidth) == 0) && | |||
| 4406 | "Illegal vector"); | |||
| 4407 | ||||
| 4408 | const int NumLegalVectors = *LT.first.getValue(); | |||
| 4409 | assert(NumLegalVectors >= 0 && "Negative cost!"); | |||
| 4410 | ||||
| 4411 | // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much | |||
| 4412 | // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. | |||
| 4413 | if (Insert) { | |||
| 4414 | if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || | |||
| 4415 | (MScalarTy.isInteger() && ST->hasSSE41()) || | |||
| 4416 | (MScalarTy == MVT::f32 && ST->hasSSE41())) { | |||
| 4417 | // For types we can insert directly, insertion into 128-bit sub vectors is | |||
| 4418 | // cheap, followed by a cheap chain of concatenations. | |||
| 4419 | if (LegalVectorBitWidth <= LaneBitWidth) { | |||
| 4420 | Cost += | |||
| 4421 | BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false); | |||
| 4422 | } else { | |||
| 4423 | // In each 128-lane, if at least one index is demanded but not all | |||
| 4424 | // indices are demanded and this 128-lane is not the first 128-lane of | |||
| 4425 | // the legalized vector, then this 128-lane needs an extracti128; if in | |||
| 4426 | // each 128-lane, there is at least one demanded index, this 128-lane | |||
| 4427 | // needs an inserti128. | |||
| 4428 | ||||
| 4429 | // The following cases will help you build a better understanding: | |||
| 4430 | // Assume we insert several elements into a v8i32 vector in avx2: | |||
| 4431 | // Case#1: inserting into the 1st index needs vpinsrd + inserti128. | |||
| 4432 | // Case#2: inserting into the 5th index needs extracti128 + vpinsrd + | |||
| 4433 | // inserti128. | |||
| 4434 | // Case#3: inserting into indices 4,5,6,7 needs 4*vpinsrd + inserti128. | |||
| 4435 | assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector"); | |||
| 4436 | unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; | |||
| 4437 | unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; | |||
| 4438 | unsigned NumLegalElts = | |||
| 4439 | LT.second.getVectorNumElements() * NumLegalVectors; | |||
| 4440 | assert(NumLegalElts >= DemandedElts.getBitWidth() && | |||
| 4441 | "Vector has been legalized to smaller element count"); | |||
| 4442 | assert((NumLegalElts % NumLanesTotal) == 0 && | |||
| 4443 | "Unexpected elts per lane"); | |||
| 4444 | unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; | |||
| 4445 | ||||
| 4446 | APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts); | |||
| 4447 | auto *LaneTy = | |||
| 4448 | FixedVectorType::get(Ty->getElementType(), NumEltsPerLane); | |||
| 4449 | ||||
| 4450 | for (unsigned I = 0; I != NumLanesTotal; ++I) { | |||
| 4451 | APInt LaneEltMask = WidenedDemandedElts.extractBits( | |||
| 4452 | NumEltsPerLane, NumEltsPerLane * I); | |||
| 4453 | if (LaneEltMask.isNullValue()) | |||
| 4454 | continue; | |||
| 4455 | // FIXME: we don't need to extract if all non-demanded elements | |||
| 4456 | // are legalization-inserted padding. | |||
| 4457 | if (!LaneEltMask.isAllOnes()) | |||
| 4458 | Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, | |||
| 4459 | CostKind, I * NumEltsPerLane, LaneTy); | |||
| 4460 | Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert, | |||
| 4461 | false); | |||
| 4462 | } | |||
| 4463 | ||||
| 4464 | APInt AffectedLanes = | |||
| 4465 | APIntOps::ScaleBitMask(WidenedDemandedElts, NumLanesTotal); | |||
| 4466 | APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask( | |||
| 4467 | AffectedLanes, NumLegalVectors, /*MatchAllBits=*/true); | |||
| 4468 | for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) { | |||
| 4469 | for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) { | |||
| 4470 | unsigned I = NumLegalLanes * LegalVec + Lane; | |||
| 4471 | // No need to insert unaffected lane; or lane 0 of each legal vector | |||
| 4472 | // iff ALL lanes of that vector were affected and will be inserted. | |||
| 4473 | if (!AffectedLanes[I] || | |||
| 4474 | (Lane == 0 && FullyAffectedLegalVectors[LegalVec])) | |||
| 4475 | continue; | |||
| 4476 | Cost += getShuffleCost(TTI::SK_InsertSubvector, Ty, std::nullopt, | |||
| 4477 | CostKind, I * NumEltsPerLane, LaneTy); | |||
| 4478 | } | |||
| 4479 | } | |||
| 4480 | } | |||
| 4481 | } else if (LT.second.isVector()) { | |||
| 4482 | // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded | |||
| 4483 | // integer element as a SCALAR_TO_VECTOR, then we build the vector as a | |||
| 4484 | // series of UNPCK followed by CONCAT_VECTORS - all of these can be | |||
| 4485 | // considered cheap. | |||
| 4486 | if (Ty->isIntOrIntVectorTy()) | |||
| 4487 | Cost += DemandedElts.countPopulation(); | |||
| 4488 | ||||
| 4489 | // Get the smaller of the legalized or original pow2-extended number of | |||
| 4490 | // vector elements, which represents the number of unpacks we'll end up | |||
| 4491 | // performing. | |||
| 4492 | unsigned NumElts = LT.second.getVectorNumElements(); | |||
| 4493 | unsigned Pow2Elts = | |||
| 4494 | PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements()); | |||
| 4495 | Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first; | |||
| 4496 | } | |||
| 4497 | } | |||
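To make Case#2 from the comment earlier in this function concrete, here is the arithmetic in standalone form (a worked example, not the real cost query):

```cpp
#include <cstdio>

int main() {
  // One demanded index (5) in the upper 128-bit lane of a v8i32 on AVX2.
  unsigned ExtractLane = 1;   // extracti128 of the partially-demanded lane
  unsigned ScalarInserts = 1; // one vpinsrd for the single demanded element
  unsigned InsertLane = 1;    // inserti128 to put the lane back
  std::printf("case#2 cost: %u\n",
              ExtractLane + ScalarInserts + InsertLane); // 3
}
```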
| 4498 | ||||
| 4499 | if (Extract) { | |||
| 4500 | // vXi1 can be efficiently extracted with MOVMSK. | |||
| 4501 | // TODO: AVX512 predicate mask handling. | |||
| 4502 | // NOTE: This doesn't work well for roundtrip scalarization. | |||
| 4503 | if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) { | |||
| 4504 | unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements(); | |||
| 4505 | unsigned MaxElts = ST->hasAVX2() ? 32 : 16; | |||
| 4506 | unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts; | |||
| 4507 | return MOVMSKCost; | |||
| 4508 | } | |||
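The MOVMSK cost above in plain numbers, as a minimal sketch (movmskCost is an ad hoc name): one mask move is charged per source register's worth of i1 elements.

```cpp
#include <cstdio>

unsigned movmskCost(unsigned NumElts, bool HasAVX2) {
  unsigned MaxElts = HasAVX2 ? 32 : 16; // i1 elements covered per MOVMSK
  return (NumElts + MaxElts - 1) / MaxElts;
}

int main() {
  std::printf("v64i1: AVX2 %u, SSE %u\n", movmskCost(64, true),
              movmskCost(64, false)); // 2 and 4
}
```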
| 4509 | ||||
| 4510 | if (LT.second.isVector()) { | |||
| 4511 | unsigned NumLegalElts = | |||
| 4512 | LT.second.getVectorNumElements() * NumLegalVectors; | |||
| 4513 | assert(NumLegalElts >= DemandedElts.getBitWidth() && | |||
| 4514 | "Vector has been legalized to smaller element count"); | |||
| 4515 | ||||
| 4516 | // If we're extracting elements from a 128-bit subvector lane, | |||
| 4517 | // we only need to extract each lane once, not for every element. | |||
| 4518 | if (LegalVectorBitWidth > LaneBitWidth) { | |||
| 4519 | unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; | |||
| 4520 | unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; | |||
| 4521 | assert((NumLegalElts % NumLanesTotal) == 0 && | |||
| 4522 | "Unexpected elts per lane"); | |||
| 4523 | unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; | |||
| 4524 | ||||
| 4525 | // Add cost for each demanded 128-bit subvector extraction. | |||
| 4526 | // Luckily this is a lot easier than for insertion. | |||
| 4527 | APInt WidenedDemandedElts = DemandedElts.zext(NumLegalElts); | |||
| 4528 | auto *LaneTy = | |||
| 4529 | FixedVectorType::get(Ty->getElementType(), NumEltsPerLane); | |||
| 4530 | ||||
| 4531 | for (unsigned I = 0; I != NumLanesTotal; ++I) { | |||
| 4532 | APInt LaneEltMask = WidenedDemandedElts.extractBits( | |||
| 4533 | NumEltsPerLane, I * NumEltsPerLane); | |||
| 4534 | if (LaneEltMask.isNullValue()) | |||
| 4535 | continue; | |||
| 4536 | Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, | |||
| 4537 | CostKind, I * NumEltsPerLane, LaneTy); | |||
| 4538 | Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, false, | |||
| 4539 | Extract); | |||
| 4540 | } | |||
| 4541 | ||||
| 4542 | return Cost; | |||
| 4543 | } | |||
| 4544 | } | |||
| 4545 | ||||
| 4546 | // Fallback to default extraction. | |||
| 4547 | Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract); | |||
| 4548 | } | |||
| 4549 | ||||
| 4550 | return Cost; | |||
| 4551 | } | |||
| 4552 | ||||
| 4553 | InstructionCost | |||
| 4554 | X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, | |||
| 4555 | int VF, const APInt &DemandedDstElts, | |||
| 4556 | TTI::TargetCostKind CostKind) { | |||
| 4557 | const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy); | |||
| 4558 | // We don't differentiate element types here, only element bit width. | |||
| 4559 | EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits); | |||
| 4560 | ||||
| 4561 | auto bailout = [&]() { | |||
| 4562 | return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF, | |||
| 4563 | DemandedDstElts, CostKind); | |||
| 4564 | }; | |||
| 4565 | ||||
| 4566 | // For now, only deal with AVX512 cases. | |||
| 4567 | if (!ST->hasAVX512()) | |||
| 4568 | return bailout(); | |||
| 4569 | ||||
| 4570 | // Do we have a native shuffle for this element type, or should we promote? | |||
| 4571 | unsigned PromEltTyBits = EltTyBits; | |||
| 4572 | switch (EltTyBits) { | |||
| 4573 | case 32: | |||
| 4574 | case 64: | |||
| 4575 | break; // AVX512F. | |||
| 4576 | case 16: | |||
| 4577 | if (!ST->hasBWI()) | |||
| 4578 | PromEltTyBits = 32; // promote to i32, AVX512F. | |||
| 4579 | break; // AVX512BW | |||
| 4580 | case 8: | |||
| 4581 | if (!ST->hasVBMI()) | |||
| 4582 | PromEltTyBits = 32; // promote to i32, AVX512F. | |||
| 4583 | break; // AVX512VBMI | |||
| 4584 | case 1: | |||
| 4585 | // There is no support for shuffling i1 elements. We *must* promote. | |||
| 4586 | if (ST->hasBWI()) { | |||
| 4587 | if (ST->hasVBMI()) | |||
| 4588 | PromEltTyBits = 8; // promote to i8, AVX512VBMI. | |||
| 4589 | else | |||
| 4590 | PromEltTyBits = 16; // promote to i16, AVX512BW. | |||
| 4591 | break; | |||
| 4592 | } | |||
| 4593 | PromEltTyBits = 32; // promote to i32, AVX512F. | |||
| 4594 | break; | |||
| 4595 | default: | |||
| 4596 | return bailout(); | |||
| 4597 | } | |||
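The switch above amounts to choosing the narrowest element width the available AVX512 shuffle set can handle. Restated as a small free function (an illustrative sketch under that reading, not the real code path):

```cpp
unsigned promotedEltBits(unsigned EltBits, bool HasBWI, bool HasVBMI) {
  switch (EltBits) {
  case 32:
  case 64: return EltBits;                           // AVX512F is enough
  case 16: return HasBWI ? 16u : 32u;                // else widen to i32
  case 8:  return HasVBMI ? 8u : 32u;                // else widen to i32
  case 1:  return HasVBMI ? 8u : HasBWI ? 16u : 32u; // i1 always promotes
  default: return 0;                                 // caller bails out
  }
}
```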
| 4598 | auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits); | |||
| 4599 | ||||
| 4600 | auto *SrcVecTy = FixedVectorType::get(EltTy, VF); | |||
| 4601 | auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF); | |||
| 4602 | ||||
| 4603 | int NumDstElements = VF * ReplicationFactor; | |||
| 4604 | auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements); | |||
| 4605 | auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements); | |||
| 4606 | ||||
| 4607 | // Legalize the types. | |||
| 4608 | MVT LegalSrcVecTy = getTypeLegalizationCost(SrcVecTy).second; | |||
| 4609 | MVT LegalPromSrcVecTy = getTypeLegalizationCost(PromSrcVecTy).second; | |||
| 4610 | MVT LegalPromDstVecTy = getTypeLegalizationCost(PromDstVecTy).second; | |||
| 4611 | MVT LegalDstVecTy = getTypeLegalizationCost(DstVecTy).second; | |||
| 4612 | // They should have legalized into vector types. | |||
| 4613 | if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() || | |||
| 4614 | !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector()) | |||
| 4615 | return bailout(); | |||
| 4616 | ||||
| 4617 | if (PromEltTyBits != EltTyBits) { | |||
| 4618 | // If we have to perform the shuffle with a wider elt type than our data type, | |||
| 4619 | // then we will first need to anyext (we don't care about the new bits) | |||
| 4620 | // the source elements, and then truncate Dst elements. | |||
| 4621 | InstructionCost PromotionCost; | |||
| 4622 | PromotionCost += getCastInstrCost( | |||
| 4623 | Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy, | |||
| 4624 | TargetTransformInfo::CastContextHint::None, CostKind); | |||
| 4625 | PromotionCost += | |||
| 4626 | getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy, | |||
| 4627 | /*Src=*/PromDstVecTy, | |||
| 4628 | TargetTransformInfo::CastContextHint::None, CostKind); | |||
| 4629 | return PromotionCost + getReplicationShuffleCost(PromEltTy, | |||
| 4630 | ReplicationFactor, VF, | |||
| 4631 | DemandedDstElts, CostKind); | |||
| 4632 | } | |||
| 4633 | ||||
| 4634 | assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && | |||
| 4635 | LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() && | |||
| 4636 | "We expect that the legalization doesn't affect the element width, " | |||
| 4637 | "doesn't coalesce/split elements."); | |||
| 4638 | ||||
| 4639 | unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements(); | |||
| 4640 | unsigned NumDstVectors = | |||
| 4641 | divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec); | |||
| 4642 | ||||
| 4643 | auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec); | |||
| 4644 | ||||
| 4645 | // Not all the produced Dst elements may be demanded. In our case, | |||
| 4646 | // given that a single Dst vector is formed by a single shuffle, | |||
| 4647 | // if all elements that will form a single Dst vector aren't demanded, | |||
| 4648 | // then we won't need to do that shuffle, so adjust the cost accordingly. | |||
| 4649 | APInt DemandedDstVectors = APIntOps::ScaleBitMask( | |||
| 4650 | DemandedDstElts.zext(NumDstVectors * NumEltsPerDstVec), NumDstVectors); | |||
| 4651 | unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation(); | |||
| 4652 | ||||
| 4653 | InstructionCost SingleShuffleCost = getShuffleCost( | |||
| 4654 | TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/std::nullopt, CostKind, | |||
| 4655 | /*Index=*/0, /*SubTp=*/nullptr); | |||
| 4656 | return NumDstVectorsDemanded * SingleShuffleCost; | |||
| 4657 | } | |||
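An example of the demand scaling at the end of this function: 12 replicated elements legalized as two 6-element destination vectors, with only elements 0..5 demanded, pay for only one shuffle (a worked sketch in plain bit masks):

```cpp
#include <cstdio>

int main() {
  unsigned NumDstVectors = 2, EltsPerVec = 6, Paid = 0;
  unsigned long long Demanded = 0x3F; // bits 0..5 demanded
  for (unsigned V = 0; V != NumDstVectors; ++V) {
    unsigned long long VecMask = ((1ULL << EltsPerVec) - 1)
                                 << (V * EltsPerVec);
    if (Demanded & VecMask) // any demanded element lands in this dst vector
      ++Paid;
  }
  std::printf("shuffles paid: %u of %u\n", Paid, NumDstVectors); // 1 of 2
}
```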
| 4658 | ||||
| 4659 | InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, | |||
| 4660 | MaybeAlign Alignment, | |||
| 4661 | unsigned AddressSpace, | |||
| 4662 | TTI::TargetCostKind CostKind, | |||
| 4663 | TTI::OperandValueInfo OpInfo, | |||
| 4664 | const Instruction *I) { | |||
| 4665 | // TODO: Handle other cost kinds. | |||
| 4666 | if (CostKind != TTI::TCK_RecipThroughput) { | |||
| 4667 | if (auto *SI = dyn_cast_or_null<StoreInst>(I)) { | |||
| 4668 | // Store instruction with index and scale costs 2 Uops. | |||
| 4669 | // Check the preceding GEP to identify non-const indices. | |||
| 4670 | if (auto *GEP = dyn_cast<GetElementPtrInst>(SI->getPointerOperand())) { | |||
| 4671 | if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); })) | |||
| 4672 | return TTI::TCC_Basic * 2; | |||
| 4673 | } | |||
| 4674 | } | |||
| 4675 | return TTI::TCC_Basic; | |||
| 4676 | } | |||
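A C++-level picture of the store case handled above (an assumed example): a store whose address comes from a GEP with a variable index needs indexed addressing, which the model prices at two uops for the non-throughput cost kinds, while a constant-offset store stays at one.

```cpp
void storePatterns(int *A, int I, int V) {
  A[I] = V; // GEP with a variable index -> base+index*scale store, 2 uops
  A[3] = V; // GEP with constant indices -> simple addressing, 1 uop
}
```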
| 4677 | ||||
| 4678 | assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && | |||
| 4679 | "Invalid Opcode"); | |||
| 4680 | // Type legalization can't handle structs | |||
| 4681 | if (TLI->getValueType(DL, Src, true) == MVT::Other) | |||
| 4682 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | |||
| 4683 | CostKind); | |||
| 4684 | ||||
| 4685 | // Legalize the type. | |||
| 4686 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src); | |||
| 4687 | ||||
| 4688 | auto *VTy = dyn_cast<FixedVectorType>(Src); | |||
| 4689 | ||||
| 4690 | InstructionCost Cost = 0; | |||
| 4691 | ||||
| 4692 | // Add a cost for constant load to vector. | |||
| 4693 | if (Opcode == Instruction::Store && OpInfo.isConstant()) | |||
| 4694 | Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src), | |||
| 4695 | /*AddressSpace=*/0, CostKind); | |||
| 4696 | ||||
| 4697 | // Handle the simple case of non-vectors. | |||
| 4698 | // NOTE: this assumes that legalization never creates vectors from scalars! | |||
| 4699 | if (!VTy || !LT.second.isVector()) { | |||
| 4700 | // Each load/store unit costs 1. | |||
| 4701 | return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1; | |||
| 4702 | } | |||
| 4703 | ||||
| 4704 | bool IsLoad = Opcode == Instruction::Load; | |||
| 4705 | ||||
| 4706 | Type *EltTy = VTy->getElementType(); | |||
| 4707 | ||||
| 4708 | const int EltTyBits = DL.getTypeSizeInBits(EltTy); | |||
| 4709 | ||||
| 4710 | // Source of truth: how many elements were there in the original IR vector? | |||
| 4711 | const unsigned SrcNumElt = VTy->getNumElements(); | |||
| 4712 | ||||
| 4713 | // How far have we gotten? | |||
| 4714 | int NumEltRemaining = SrcNumElt; | |||
| 4715 | // Note that we intentionally capture by reference; NumEltRemaining changes. | |||
| 4716 | auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; }; | |||
| 4717 | ||||
| 4718 | const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8); | |||
| 4719 | ||||
| 4720 | // Note that even if we can store 64 bits of an XMM, we still operate on XMM. | |||
| 4721 | const unsigned XMMBits = 128; | |||
| 4722 | if (XMMBits % EltTyBits != 0) | |||
| 4723 | // Vector size must be a multiple of the element size. I.e. no padding. | |||
| 4724 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | |||
| 4725 | CostKind); | |||
| 4726 | const int NumEltPerXMM = XMMBits / EltTyBits; | |||
| 4727 | ||||
| 4728 | auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM); | |||
| 4729 | ||||
| 4730 | for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0; | |||
| 4731 | NumEltRemaining > 0; CurrOpSizeBytes /= 2) { | |||
| 4732 | // How many elements would a single op deal with at once? | |||
| 4733 | if ((8 * CurrOpSizeBytes) % EltTyBits != 0) | |||
| 4734 | // Vector size must be a multiple of the element size. I.e. no padding. | |||
| 4735 | return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, | |||
| 4736 | CostKind); | |||
| 4737 | int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits; | |||
| 4738 | ||||
| 4739 | assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?"); | |||
| 4740 | assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) || | |||
| 4741 | (CurrOpSizeBytes == MaxLegalOpSizeBytes)) && | |||
| 4742 | "Unless we haven't halved the op size yet, " | |||
| 4743 | "we have less than two op's sized units of work left."); | |||
| 4744 | ||||
| 4745 | auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM | |||
| 4746 | ? FixedVectorType::get(EltTy, CurrNumEltPerOp) | |||
| 4747 | : XMMVecTy; | |||
| 4748 | ||||
| 4749 | assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 && | |||
| 4750 | "After halving sizes, the vector elt count is no longer a multiple " | |||
| 4751 | "of number of elements per operation?"); | |||
| 4752 | auto *CoalescedVecTy = | |||
| 4753 | CurrNumEltPerOp == 1 | |||
| 4754 | ? CurrVecTy | |||
| 4755 | : FixedVectorType::get( | |||
| 4756 | IntegerType::get(Src->getContext(), | |||
| 4757 | EltTyBits * CurrNumEltPerOp), | |||
| 4758 | CurrVecTy->getNumElements() / CurrNumEltPerOp); | |||
| 4759 | assert(DL.getTypeSizeInBits(CoalescedVecTy) == | |||
| 4760 | DL.getTypeSizeInBits(CurrVecTy) && | |||
| 4761 | "coalescing elements doesn't change vector width."); | |||
| 4762 | ||||
| 4763 | while (NumEltRemaining > 0) { | |||
| 4764 | assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?"); | |||
| 4765 | ||||
| 4766 | // Can we use this vector size, as per the remaining element count? | |||
| 4767 | // Iff the vector is naturally aligned, we can do a wide load regardless. | |||
| 4768 | if (NumEltRemaining < CurrNumEltPerOp && | |||
| 4769 | (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) && | |||
| 4770 | CurrOpSizeBytes != 1) | |||
| 4771 | break; // Try smaller vector size. | |||
| 4772 | ||||
| 4773 | bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0; | |||
| 4774 | ||||
| 4775 | // If we have fully processed the previous reg, we need to replenish it. | |||
| 4776 | if (SubVecEltsLeft == 0) { | |||
| 4777 | SubVecEltsLeft += CurrVecTy->getNumElements(); | |||
| 4778 | // And that's free only for the 0'th subvector of a legalized vector. | |||
| 4779 | if (!Is0thSubVec) | |||
| 4780 | Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector | |||
| 4781 | : TTI::ShuffleKind::SK_ExtractSubvector, | |||
| 4782 | VTy, std::nullopt, CostKind, NumEltDone(), | |||
| 4783 | CurrVecTy); | |||
| 4784 | } | |||
| 4785 | ||||
| 4786 | // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM, | |||
| 4787 | // for smaller widths (32/16/8) we have to insert/extract them separately. | |||
| 4788 | // Again, it's free for the 0'th subreg (if op is 32/64 bit wide, | |||
| 4789 | // but let's pretend that it is also true for 16/8 bit wide ops...) | |||
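| | // Illustrative sketch: writing out a non-0'th 4-byte subreg of an XMM is | |||
| | // modelled below as a single-lane extract from the coalesced vector (and | |||
| | // a single-lane insert for loads); full 16/32/64-byte ops skip this step. | |||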
| 4790 | if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) { | |||
| 4791 | int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM; | |||
| 4792 | assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && ""); | |||
| 4793 | int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp; | |||
| 4794 | APInt DemandedElts = | |||
| 4795 | APInt::getBitsSet(CoalescedVecTy->getNumElements(), | |||
| 4796 | CoalescedVecEltIdx, CoalescedVecEltIdx + 1); | |||
| 4797 | assert(DemandedElts.countPopulation() == 1 && "Inserting single value"); | |||
| 4798 | Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad, | |||
| 4799 | !IsLoad); | |||
| 4800 | } | |||
| 4801 | ||||
| 4802 | // This isn't exactly right. We're using slow unaligned 32-byte accesses | |||
| 4803 | // as a proxy for a double-pumped AVX memory interface such as on | |||
| 4804 | // Sandybridge. | |||
| 4805 | if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow()) | |||
| 4806 | Cost += 2; | |||
| 4807 | else | |||
| 4808 | Cost += 1; | |||
| 4809 | ||||
| 4810 | SubVecEltsLeft -= CurrNumEltPerOp; | |||
| 4811 | NumEltRemaining -= CurrNumEltPerOp; | |||
| 4812 | Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes); | |||
| 4813 | } | |||
| 4814 | } | |||
| 4815 | ||||
| 4816 | assert(NumEltRemaining <= 0 && "Should have processed all the elements."); | |||
| 4817 | ||||
| 4818 | return Cost; | |||
| 4819 | } | |||
| 4820 | ||||
| 4821 | InstructionCost | |||
| 4822 | X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, | |||
| 4823 | unsigned AddressSpace, | |||
| 4824 | TTI::TargetCostKind CostKind) { | |||
| 4825 | bool IsLoad = (Instruction::Load == Opcode); | |||
| 4826 | bool IsStore = (Instruction::Store == Opcode); | |||
| 4827 | ||||
| 4828 | auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy); | |||
| 4829 | if (!SrcVTy) | |||
| 4830 | // To calculate scalar take the regular cost, without mask | |||
| 4831 | return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind); | |||
| 4832 | ||||
| 4833 | unsigned NumElem = SrcVTy->getNumElements(); | |||
| 4834 | auto *MaskTy = | |||
| 4835 | FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); | |||
| 4836 | if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) || | |||
| 4837 | (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) { | |||
| 4838 | // Scalarization | |||
| 4839 | APInt DemandedElts = APInt::getAllOnes(NumElem); | |||
| 4840 | InstructionCost MaskSplitCost = | |||
| 4841 | getScalarizationOverhead(MaskTy, DemandedElts, false, true); | |||
| 4842 | InstructionCost ScalarCompareCost = getCmpSelInstrCost( | |||
| 4843 | Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr, | |||
| 4844 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | |||
| 4845 | InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind); | |||
| 4846 | InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); | |||
| 4847 | InstructionCost ValueSplitCost = | |||
| 4848 | getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore); | |||
| 4849 | InstructionCost MemopCost = | |||
| 4850 | NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), | |||
| 4851 | Alignment, AddressSpace, CostKind); | |||
| 4852 | return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; | |||
| 4853 | } | |||
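| | // Rough worked reading of the branch above: an illegal <4 x i32> masked | |||
| | // load on a pre-AVX target is priced as 4 scalar loads + extracting the 4 | |||
| | // mask lanes + 4 * (icmp + branch) + inserting the 4 loaded lanes. | |||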
| 4854 | ||||
| 4855 | // Legalize the type. | |||
| 4856 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy); | |||
| 4857 | auto VT = TLI->getValueType(DL, SrcVTy); | |||
| 4858 | InstructionCost Cost = 0; | |||
| 4859 | if (VT.isSimple() && LT.second != VT.getSimpleVT() && | |||
| 4860 | LT.second.getVectorNumElements() == NumElem) | |||
| 4861 | // Promotion requires extend/truncate for data and a shuffle for mask. | |||
| 4862 | Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt, | |||
| 4863 | CostKind, 0, nullptr) + | |||
| 4864 | getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt, | |||
| 4865 | CostKind, 0, nullptr); | |||
| 4866 | ||||
| 4867 | else if (LT.first * LT.second.getVectorNumElements() > NumElem) { | |||
| 4868 | auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(), | |||
| 4869 | LT.second.getVectorNumElements()); | |||
| 4870 | // Expanding requires filling the mask with zeroes | |||
| 4871 | Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt, | |||
| 4872 | CostKind, 0, MaskTy); | |||
| 4873 | } | |||
| 4874 | ||||
| 4875 | // Pre-AVX512 - each maskmov load costs 2 and each maskmov store costs ~8. | |||
| 4876 | if (!ST->hasAVX512()) | |||
| 4877 | return Cost + LT.first * (IsLoad ? 2 : 8); | |||
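| | // E.g. (sketch): a legal <8 x float> maskmov store on AVX comes out as | |||
| | // LT.first * 8, while the AVX-512 path below charges only LT.first. | |||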
| 4878 | ||||
| 4879 | // AVX-512 masked load/store is cheaper | |||
| 4880 | return Cost + LT.first; | |||
| 4881 | } | |||
| 4882 | ||||
| 4883 | InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, | |||
| 4884 | ScalarEvolution *SE, | |||
| 4885 | const SCEV *Ptr) { | |||
| 4886 | // Address computations in vectorized code with non-consecutive addresses will | |||
| 4887 | // likely result in more instructions compared to scalar code where the | |||
| 4888 | // computation can more often be merged into the index mode. The resulting | |||
| 4889 | // extra micro-ops can significantly decrease throughput. | |||
| 4890 | const unsigned NumVectorInstToHideOverhead = 10; | |||
| 4891 | ||||
| 4892 | // Cost modeling of Strided Access Computation is hidden by the indexing | |||
| 4893 | // modes of X86 regardless of the stride value. We don't believe that there | |||
| 4894 | // is a difference between constant strided access in general and a constant | |||
| 4895 | // stride value which is less than or equal to 64. | |||
| 4896 | // Even in the case of (loop invariant) stride whose value is not known at | |||
| 4897 | // compile time, the address computation will not incur more than one extra | |||
| 4898 | // ADD instruction. | |||
| 4899 | if (Ty->isVectorTy() && SE && !ST->hasAVX2()) { | |||
| 4900 | // TODO: AVX2 is the current cut-off because we don't have correct | |||
| 4901 | // interleaving costs for prior ISA's. | |||
| 4902 | if (!BaseT::isStridedAccess(Ptr)) | |||
| 4903 | return NumVectorInstToHideOverhead; | |||
| 4904 | if (!BaseT::getConstantStrideStep(SE, Ptr)) | |||
| 4905 | return 1; | |||
| 4906 | } | |||
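| | // Illustrative outcomes for the pre-AVX2 block above: a non-strided vector | |||
| | // access returns 10 (NumVectorInstToHideOverhead), a strided access with a | |||
| | // non-constant step returns 1, and everything else falls through below. | |||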
| 4907 | ||||
| 4908 | return BaseT::getAddressComputationCost(Ty, SE, Ptr); | |||
| 4909 | } | |||
| 4910 | ||||
| 4911 | InstructionCost | |||
| 4912 | X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, | |||
| 4913 | std::optional<FastMathFlags> FMF, | |||
| 4914 | TTI::TargetCostKind CostKind) { | |||
| 4915 | if (TTI::requiresOrderedReduction(FMF)) | |||
| 4916 | return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); | |||
| 4917 | ||||
| 4918 | // We use the Intel Architecture Code Analyzer (IACA) to measure the | |||
| 4919 | // throughput and use it as the cost. | |||
| 4920 | ||||
| 4921 | static const CostTblEntry SLMCostTblNoPairWise[] = { | |||
| 4922 | { ISD::FADD, MVT::v2f64, 3 }, | |||
| 4923 | { ISD::ADD, MVT::v2i64, 5 }, | |||
| 4924 | }; | |||
| 4925 | ||||
| 4926 | static const CostTblEntry SSE2CostTblNoPairWise[] = { | |||
| 4927 | { ISD::FADD, MVT::v2f64, 2 }, | |||
| 4928 | { ISD::FADD, MVT::v2f32, 2 }, | |||
| 4929 | { ISD::FADD, MVT::v4f32, 4 }, | |||
| 4930 | { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". | |||
| 4931 | { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32 | |||
| 4932 | { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3". | |||
| 4933 | { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3". | |||
| 4934 | { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3". | |||
| 4935 | { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". | |||
| 4936 | { ISD::ADD, MVT::v2i8, 2 }, | |||
| 4937 | { ISD::ADD, MVT::v4i8, 2 }, | |||
| 4938 | { ISD::ADD, MVT::v8i8, 2 }, | |||
| 4939 | { ISD::ADD, MVT::v16i8, 3 }, | |||
| 4940 | }; | |||
| 4941 | ||||
| 4942 | static const CostTblEntry AVX1CostTblNoPairWise[] = { | |||
| 4943 | { ISD::FADD, MVT::v4f64, 3 }, | |||
| 4944 | { ISD::FADD, MVT::v4f32, 3 }, | |||
| 4945 | { ISD::FADD, MVT::v8f32, 4 }, | |||
| 4946 | { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". | |||
| 4947 | { ISD::ADD, MVT::v4i64, 3 }, | |||
| 4948 | { ISD::ADD, MVT::v8i32, 5 }, | |||
| 4949 | { ISD::ADD, MVT::v16i16, 5 }, | |||
| 4950 | { ISD::ADD, MVT::v32i8, 4 }, | |||
| 4951 | }; | |||
| 4952 | ||||
| 4953 | int ISD = TLI->InstructionOpcodeToISD(Opcode); | |||
| 4954 | assert(ISD && "Invalid opcode"); | |||
| 4955 | ||||
| 4956 | // Before legalizing the type, give a chance to look up illegal narrow types | |||
| 4957 | // in the table. | |||
| 4958 | // FIXME: Is there a better way to do this? | |||
| 4959 | EVT VT = TLI->getValueType(DL, ValTy); | |||
| 4960 | if (VT.isSimple()) { | |||
| 4961 | MVT MTy = VT.getSimpleVT(); | |||
| 4962 | if (ST->useSLMArithCosts()) | |||
| 4963 | if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) | |||
| 4964 | return Entry->Cost; | |||
| 4965 | ||||
| 4966 | if (ST->hasAVX()) | |||
| 4967 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | |||
| 4968 | return Entry->Cost; | |||
| 4969 | ||||
| 4970 | if (ST->hasSSE2()) | |||
| 4971 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | |||
| 4972 | return Entry->Cost; | |||
| 4973 | } | |||
| 4974 | ||||
| 4975 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); | |||
| 4976 | ||||
| 4977 | MVT MTy = LT.second; | |||
| 4978 | ||||
| 4979 | auto *ValVTy = cast<FixedVectorType>(ValTy); | |||
| 4980 | ||||
| 4981 | // Special case: vXi8 mul reductions are performed as vXi16. | |||
| 4982 | if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) { | |||
| 4983 | auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16); | |||
| 4984 | auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements()); | |||
| 4985 | return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy, | |||
| 4986 | TargetTransformInfo::CastContextHint::None, | |||
| 4987 | CostKind) + | |||
| 4988 | getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind); | |||
| 4989 | } | |||
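| | // E.g.: a mul reduction of <16 x i8> is priced as a zext to <16 x i16> | |||
| | // plus a <16 x i16> mul reduction, per the special case above. | |||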
| 4990 | ||||
| 4991 | InstructionCost ArithmeticCost = 0; | |||
| 4992 | if (LT.first != 1 && MTy.isVector() && | |||
| 4993 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | |||
| 4994 | // Type needs to be split. We need LT.first - 1 arithmetic ops. | |||
| 4995 | auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), | |||
| 4996 | MTy.getVectorNumElements()); | |||
| 4997 | ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); | |||
| 4998 | ArithmeticCost *= LT.first - 1; | |||
| 4999 | } | |||
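| | // Worked sketch: an add reduction of <16 x i32> on SSE2 legalizes to four | |||
| | // v4i32 parts (LT.first == 4), so three full-width v4i32 adds are charged | |||
| | // here before the single-register reduction below. | |||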
| 5000 | ||||
| 5001 | if (ST->useSLMArithCosts()) | |||
| 5002 | if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) | |||
| 5003 | return ArithmeticCost + Entry->Cost; | |||
| 5004 | ||||
| 5005 | if (ST->hasAVX()) | |||
| 5006 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | |||
| 5007 | return ArithmeticCost + Entry->Cost; | |||
| 5008 | ||||
| 5009 | if (ST->hasSSE2()) | |||
| 5010 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | |||
| 5011 | return ArithmeticCost + Entry->Cost; | |||
| 5012 | ||||
| 5013 | // FIXME: These assume a naive kshift+binop lowering, which is probably | |||
| 5014 | // conservative in most cases. | |||
| 5015 | static const CostTblEntry AVX512BoolReduction[] = { | |||
| 5016 | { ISD::AND, MVT::v2i1, 3 }, | |||
| 5017 | { ISD::AND, MVT::v4i1, 5 }, | |||
| 5018 | { ISD::AND, MVT::v8i1, 7 }, | |||
| 5019 | { ISD::AND, MVT::v16i1, 9 }, | |||
| 5020 | { ISD::AND, MVT::v32i1, 11 }, | |||
| 5021 | { ISD::AND, MVT::v64i1, 13 }, | |||
| 5022 | { ISD::OR, MVT::v2i1, 3 }, | |||
| 5023 | { ISD::OR, MVT::v4i1, 5 }, | |||
| 5024 | { ISD::OR, MVT::v8i1, 7 }, | |||
| 5025 | { ISD::OR, MVT::v16i1, 9 }, | |||
| 5026 | { ISD::OR, MVT::v32i1, 11 }, | |||
| 5027 | { ISD::OR, MVT::v64i1, 13 }, | |||
| 5028 | }; | |||
| 5029 | ||||
| 5030 | static const CostTblEntry AVX2BoolReduction[] = { | |||
| 5031 | { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp | |||
| 5032 | { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp | |||
| 5033 | { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp | |||
| 5034 | { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp | |||
| 5035 | }; | |||
| 5036 | ||||
| 5037 | static const CostTblEntry AVX1BoolReduction[] = { | |||
| 5038 | { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp | |||
| 5039 | { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp | |||
| 5040 | { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp | |||
| 5041 | { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp | |||
| 5042 | { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp | |||
| 5043 | { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp | |||
| 5044 | { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp | |||
| 5045 | { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp | |||
| 5046 | }; | |||
| 5047 | ||||
| 5048 | static const CostTblEntry SSE2BoolReduction[] = { | |||
| 5049 | { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp | |||
| 5050 | { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp | |||
| 5051 | { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp | |||
| 5052 | { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp | |||
| 5053 | { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp | |||
| 5054 | { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp | |||
| 5055 | { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp | |||
| 5056 | { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp | |||
| 5057 | }; | |||
| 5058 | ||||
| 5059 | // Handle bool allof/anyof patterns. | |||
| 5060 | if (ValVTy->getElementType()->isIntegerTy(1)) { | |||
| 5061 | InstructionCost ArithmeticCost = 0; | |||
| 5062 | if (LT.first != 1 && MTy.isVector() && | |||
| 5063 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | |||
| 5064 | // Type needs to be split. We need LT.first - 1 arithmetic ops. | |||
| 5065 | auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(), | |||
| 5066 | MTy.getVectorNumElements()); | |||
| 5067 | ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind); | |||
| 5068 | ArithmeticCost *= LT.first - 1; | |||
| 5069 | } | |||
| 5070 | ||||
| 5071 | if (ST->hasAVX512()) | |||
| 5072 | if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy)) | |||
| 5073 | return ArithmeticCost + Entry->Cost; | |||
| 5074 | if (ST->hasAVX2()) | |||
| 5075 | if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy)) | |||
| 5076 | return ArithmeticCost + Entry->Cost; | |||
| 5077 | if (ST->hasAVX()) | |||
| 5078 | if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy)) | |||
| 5079 | return ArithmeticCost + Entry->Cost; | |||
| 5080 | if (ST->hasSSE2()) | |||
| 5081 | if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) | |||
| 5082 | return ArithmeticCost + Entry->Cost; | |||
| 5083 | ||||
| 5084 | return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); | |||
| 5085 | } | |||
| 5086 | ||||
| 5087 | unsigned NumVecElts = ValVTy->getNumElements(); | |||
| 5088 | unsigned ScalarSize = ValVTy->getScalarSizeInBits(); | |||
| 5089 | ||||
| 5090 | // Special case power of 2 reductions where the scalar type isn't changed | |||
| 5091 | // by type legalization. | |||
| 5092 | if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) | |||
| 5093 | return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); | |||
| 5094 | ||||
| 5095 | InstructionCost ReductionCost = 0; | |||
| 5096 | ||||
| 5097 | auto *Ty = ValVTy; | |||
| 5098 | if (LT.first != 1 && MTy.isVector() && | |||
| 5099 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | |||
| 5100 | // Type needs to be split. We need LT.first - 1 arithmetic ops. | |||
| 5101 | Ty = FixedVectorType::get(ValVTy->getElementType(), | |||
| 5102 | MTy.getVectorNumElements()); | |||
| 5103 | ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind); | |||
| 5104 | ReductionCost *= LT.first - 1; | |||
| 5105 | NumVecElts = MTy.getVectorNumElements(); | |||
| 5106 | } | |||
| 5107 | ||||
| 5108 | // Now handle reduction with the legal type, taking into account size changes | |||
| 5109 | // at each level. | |||
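| | // Illustrative trace for a v4i32 add reduction: iteration 1 (Size == 128) | |||
| | // costs a v2i64 permute + add, iteration 2 (Size == 64) a v4i32 shuffle + | |||
| | // add, and the surviving lane is extracted after the loop. | |||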
| 5110 | while (NumVecElts > 1) { | |||
| 5111 | // Determine the size of the remaining vector we need to reduce. | |||
| 5112 | unsigned Size = NumVecElts * ScalarSize; | |||
| 5113 | NumVecElts /= 2; | |||
| 5114 | // If we're reducing from 256/512 bits, use an extract_subvector. | |||
| 5115 | if (Size > 128) { | |||
| 5116 | auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); | |||
| 5117 | ReductionCost += | |||
| 5118 | getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, CostKind, | |||
| 5119 | NumVecElts, SubTy); | |||
| 5120 | Ty = SubTy; | |||
| 5121 | } else if (Size == 128) { | |||
| 5122 | // Reducing from 128 bits is a permute of v2f64/v2i64. | |||
| 5123 | FixedVectorType *ShufTy; | |||
| 5124 | if (ValVTy->isFloatingPointTy()) | |||
| 5125 | ShufTy = | |||
| 5126 | FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2); | |||
| 5127 | else | |||
| 5128 | ShufTy = | |||
| 5129 | FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2); | |||
| 5130 | ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, | |||
| 5131 | std::nullopt, CostKind, 0, nullptr); | |||
| 5132 | } else if (Size == 64) { | |||
| 5133 | // Reducing from 64 bits is a shuffle of v4f32/v4i32. | |||
| 5134 | FixedVectorType *ShufTy; | |||
| 5135 | if (ValVTy->isFloatingPointTy()) | |||
| 5136 | ShufTy = | |||
| 5137 | FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4); | |||
| 5138 | else | |||
| 5139 | ShufTy = | |||
| 5140 | FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4); | |||
| 5141 | ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, | |||
| 5142 | std::nullopt, CostKind, 0, nullptr); | |||
| 5143 | } else { | |||
| 5144 | // Reducing from smaller size is a shift by immediate. | |||
| 5145 | auto *ShiftTy = FixedVectorType::get( | |||
| 5146 | Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size); | |||
| 5147 | ReductionCost += getArithmeticInstrCost( | |||
| 5148 | Instruction::LShr, ShiftTy, CostKind, | |||
| 5149 | {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, | |||
| 5150 | {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None}); | |||
| 5151 | } | |||
| 5152 | ||||
| 5153 | // Add the arithmetic op for this level. | |||
| 5154 | ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind); | |||
| 5155 | } | |||
| 5156 | ||||
| 5157 | // Add the final extract element to the cost. | |||
| 5158 | return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); | |||
| 5159 | } | |||
| 5160 | ||||
| 5161 | InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, | |||
| 5162 | bool IsUnsigned) { | |||
| 5163 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); | |||
| 5164 | ||||
| 5165 | MVT MTy = LT.second; | |||
| 5166 | ||||
| 5167 | int ISD; | |||
| 5168 | if (Ty->isIntOrIntVectorTy()) { | |||
| 5169 | ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; | |||
| 5170 | } else { | |||
| 5171 | assert(Ty->isFPOrFPVectorTy() && | |||
| 5172 |        "Expected floating point or integer vector type."); | |||
| 5173 | ISD = ISD::FMINNUM; | |||
| 5174 | } | |||
| 5175 | ||||
| 5176 | static const CostTblEntry SSE1CostTbl[] = { | |||
| 5177 | {ISD::FMINNUM, MVT::v4f32, 1}, | |||
| 5178 | }; | |||
| 5179 | ||||
| 5180 | static const CostTblEntry SSE2CostTbl[] = { | |||
| 5181 | {ISD::FMINNUM, MVT::v2f64, 1}, | |||
| 5182 | {ISD::SMIN, MVT::v8i16, 1}, | |||
| 5183 | {ISD::UMIN, MVT::v16i8, 1}, | |||
| 5184 | }; | |||
| 5185 | ||||
| 5186 | static const CostTblEntry SSE41CostTbl[] = { | |||
| 5187 | {ISD::SMIN, MVT::v4i32, 1}, | |||
| 5188 | {ISD::UMIN, MVT::v4i32, 1}, | |||
| 5189 | {ISD::UMIN, MVT::v8i16, 1}, | |||
| 5190 | {ISD::SMIN, MVT::v16i8, 1}, | |||
| 5191 | }; | |||
| 5192 | ||||
| 5193 | static const CostTblEntry SSE42CostTbl[] = { | |||
| 5194 | {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd | |||
| 5195 | }; | |||
| 5196 | ||||
| 5197 | static const CostTblEntry AVX1CostTbl[] = { | |||
| 5198 | {ISD::FMINNUM, MVT::v8f32, 1}, | |||
| 5199 | {ISD::FMINNUM, MVT::v4f64, 1}, | |||
| 5200 | {ISD::SMIN, MVT::v8i32, 3}, | |||
| 5201 | {ISD::UMIN, MVT::v8i32, 3}, | |||
| 5202 | {ISD::SMIN, MVT::v16i16, 3}, | |||
| 5203 | {ISD::UMIN, MVT::v16i16, 3}, | |||
| 5204 | {ISD::SMIN, MVT::v32i8, 3}, | |||
| 5205 | {ISD::UMIN, MVT::v32i8, 3}, | |||
| 5206 | }; | |||
| 5207 | ||||
| 5208 | static const CostTblEntry AVX2CostTbl[] = { | |||
| 5209 | {ISD::SMIN, MVT::v8i32, 1}, | |||
| 5210 | {ISD::UMIN, MVT::v8i32, 1}, | |||
| 5211 | {ISD::SMIN, MVT::v16i16, 1}, | |||
| 5212 | {ISD::UMIN, MVT::v16i16, 1}, | |||
| 5213 | {ISD::SMIN, MVT::v32i8, 1}, | |||
| 5214 | {ISD::UMIN, MVT::v32i8, 1}, | |||
| 5215 | }; | |||
| 5216 | ||||
| 5217 | static const CostTblEntry AVX512CostTbl[] = { | |||
| 5218 | {ISD::FMINNUM, MVT::v16f32, 1}, | |||
| 5219 | {ISD::FMINNUM, MVT::v8f64, 1}, | |||
| 5220 | {ISD::SMIN, MVT::v2i64, 1}, | |||
| 5221 | {ISD::UMIN, MVT::v2i64, 1}, | |||
| 5222 | {ISD::SMIN, MVT::v4i64, 1}, | |||
| 5223 | {ISD::UMIN, MVT::v4i64, 1}, | |||
| 5224 | {ISD::SMIN, MVT::v8i64, 1}, | |||
| 5225 | {ISD::UMIN, MVT::v8i64, 1}, | |||
| 5226 | {ISD::SMIN, MVT::v16i32, 1}, | |||
| 5227 | {ISD::UMIN, MVT::v16i32, 1}, | |||
| 5228 | }; | |||
| 5229 | ||||
| 5230 | static const CostTblEntry AVX512BWCostTbl[] = { | |||
| 5231 | {ISD::SMIN, MVT::v32i16, 1}, | |||
| 5232 | {ISD::UMIN, MVT::v32i16, 1}, | |||
| 5233 | {ISD::SMIN, MVT::v64i8, 1}, | |||
| 5234 | {ISD::UMIN, MVT::v64i8, 1}, | |||
| 5235 | }; | |||
| 5236 | ||||
| 5237 | // If we have a native MIN/MAX instruction for this type, use it. | |||
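| | // E.g.: smin on v8i16 is a single instruction from SSE2 onwards (cost 1), | |||
| | // while umin on v2i64 costs 3 on SSE4.2 (xor+pcmpgtq+blendvpd) and only | |||
| | // drops to 1 with AVX-512. | |||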
| 5238 | if (ST->hasBWI()) | |||
| 5239 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) | |||
| 5240 | return LT.first * Entry->Cost; | |||
| 5241 | ||||
| 5242 | if (ST->hasAVX512()) | |||
| 5243 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) | |||
| 5244 | return LT.first * Entry->Cost; | |||
| 5245 | ||||
| 5246 | if (ST->hasAVX2()) | |||
| 5247 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) | |||
| 5248 | return LT.first * Entry->Cost; | |||
| 5249 | ||||
| 5250 | if (ST->hasAVX()) | |||
| 5251 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) | |||
| 5252 | return LT.first * Entry->Cost; | |||
| 5253 | ||||
| 5254 | if (ST->hasSSE42()) | |||
| 5255 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) | |||
| 5256 | return LT.first * Entry->Cost; | |||
| 5257 | ||||
| 5258 | if (ST->hasSSE41()) | |||
| 5259 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) | |||
| 5260 | return LT.first * Entry->Cost; | |||
| 5261 | ||||
| 5262 | if (ST->hasSSE2()) | |||
| 5263 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) | |||
| 5264 | return LT.first * Entry->Cost; | |||
| 5265 | ||||
| 5266 | if (ST->hasSSE1()) | |||
| 5267 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) | |||
| 5268 | return LT.first * Entry->Cost; | |||
| 5269 | ||||
| 5270 | unsigned CmpOpcode; | |||
| 5271 | if (Ty->isFPOrFPVectorTy()) { | |||
| 5272 | CmpOpcode = Instruction::FCmp; | |||
| 5273 | } else { | |||
| 5274 | assert(Ty->isIntOrIntVectorTy() && | |||
| 5275 |        "expecting floating point or integer type for min/max reduction"); | |||
| 5276 | CmpOpcode = Instruction::ICmp; | |||
| 5277 | } | |||
| 5278 | ||||
| 5279 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
| 5280 | // Otherwise fall back to cmp+select. | |||
| 5281 | InstructionCost Result = | |||
| 5282 | getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE, | |||
| 5283 | CostKind) + | |||
| 5284 | getCmpSelInstrCost(Instruction::Select, Ty, CondTy, | |||
| 5285 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | |||
| 5286 | return Result; | |||
| 5287 | } | |||
| 5288 | ||||
| 5289 | InstructionCost | |||
| 5290 | X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy, | |||
| 5291 | bool IsUnsigned, | |||
| 5292 | TTI::TargetCostKind CostKind) { | |||
| 5293 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); | |||
| 5294 | ||||
| 5295 | MVT MTy = LT.second; | |||
| 5296 | ||||
| 5297 | int ISD; | |||
| 5298 | if (ValTy->isIntOrIntVectorTy()) { | |||
| 5299 | ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; | |||
| 5300 | } else { | |||
| 5301 | assert(ValTy->isFPOrFPVectorTy() && | |||
| 5302 |        "Expected floating point or integer vector type."); | |||
| 5303 | ISD = ISD::FMINNUM; | |||
| 5304 | } | |||
| 5305 | ||||
| 5306 | // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput | |||
| 5307 | // and make it as the cost. | |||
| 5308 | ||||
| 5309 | static const CostTblEntry SSE2CostTblNoPairWise[] = { | |||
| 5310 | {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw | |||
| 5311 | {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw | |||
| 5312 | {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw | |||
| 5313 | }; | |||
| 5314 | ||||
| 5315 | static const CostTblEntry SSE41CostTblNoPairWise[] = { | |||
| 5316 | {ISD::SMIN, MVT::v2i16, 3}, // same as sse2 | |||
| 5317 | {ISD::SMIN, MVT::v4i16, 5}, // same as sse2 | |||
| 5318 | {ISD::UMIN, MVT::v2i16, 5}, // same as sse2 | |||
| 5319 | {ISD::UMIN, MVT::v4i16, 7}, // same as sse2 | |||
| 5320 | {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor | |||
| 5321 | {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax | |||
| 5322 | {ISD::SMIN, MVT::v2i8, 3}, // pminsb | |||
| 5323 | {ISD::SMIN, MVT::v4i8, 5}, // pminsb | |||
| 5324 | {ISD::SMIN, MVT::v8i8, 7}, // pminsb | |||
| 5325 | {ISD::SMIN, MVT::v16i8, 6}, | |||
| 5326 | {ISD::UMIN, MVT::v2i8, 3}, // same as sse2 | |||
| 5327 | {ISD::UMIN, MVT::v4i8, 5}, // same as sse2 | |||
| 5328 | {ISD::UMIN, MVT::v8i8, 7}, // same as sse2 | |||
| 5329 | {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax | |||
| 5330 | }; | |||
| 5331 | ||||
| 5332 | static const CostTblEntry AVX1CostTblNoPairWise[] = { | |||
| 5333 | {ISD::SMIN, MVT::v16i16, 6}, | |||
| 5334 | {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax | |||
| 5335 | {ISD::SMIN, MVT::v32i8, 8}, | |||
| 5336 | {ISD::UMIN, MVT::v32i8, 8}, | |||
| 5337 | }; | |||
| 5338 | ||||
| 5339 | static const CostTblEntry AVX512BWCostTblNoPairWise[] = { | |||
| 5340 | {ISD::SMIN, MVT::v32i16, 8}, | |||
| 5341 | {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax | |||
| 5342 | {ISD::SMIN, MVT::v64i8, 10}, | |||
| 5343 | {ISD::UMIN, MVT::v64i8, 10}, | |||
| 5344 | }; | |||
| 5345 | ||||
| 5346 | // Before legalizing the type, give a chance to look up illegal narrow types | |||
| 5347 | // in the table. | |||
| 5348 | // FIXME: Is there a better way to do this? | |||
| 5349 | EVT VT = TLI->getValueType(DL, ValTy); | |||
| 5350 | if (VT.isSimple()) { | |||
| 5351 | MVT MTy = VT.getSimpleVT(); | |||
| 5352 | if (ST->hasBWI()) | |||
| 5353 | if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy)) | |||
| 5354 | return Entry->Cost; | |||
| 5355 | ||||
| 5356 | if (ST->hasAVX()) | |||
| 5357 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | |||
| 5358 | return Entry->Cost; | |||
| 5359 | ||||
| 5360 | if (ST->hasSSE41()) | |||
| 5361 | if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) | |||
| 5362 | return Entry->Cost; | |||
| 5363 | ||||
| 5364 | if (ST->hasSSE2()) | |||
| 5365 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | |||
| 5366 | return Entry->Cost; | |||
| 5367 | } | |||
| 5368 | ||||
| 5369 | auto *ValVTy = cast<FixedVectorType>(ValTy); | |||
| 5370 | unsigned NumVecElts = ValVTy->getNumElements(); | |||
| 5371 | ||||
| 5372 | auto *Ty = ValVTy; | |||
| 5373 | InstructionCost MinMaxCost = 0; | |||
| 5374 | if (LT.first != 1 && MTy.isVector() && | |||
| 5375 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { | |||
| 5376 | // Type needs to be split. We need LT.first - 1 min/max ops. | |||
| 5377 | Ty = FixedVectorType::get(ValVTy->getElementType(), | |||
| 5378 | MTy.getVectorNumElements()); | |||
| 5379 | auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(), | |||
| 5380 | MTy.getVectorNumElements()); | |||
| 5381 | MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned); | |||
| 5382 | MinMaxCost *= LT.first - 1; | |||
| 5383 | NumVecElts = MTy.getVectorNumElements(); | |||
| 5384 | } | |||
| 5385 | ||||
| 5386 | if (ST->hasBWI()) | |||
| 5387 | if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy)) | |||
| 5388 | return MinMaxCost + Entry->Cost; | |||
| 5389 | ||||
| 5390 | if (ST->hasAVX()) | |||
| 5391 | if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) | |||
| 5392 | return MinMaxCost + Entry->Cost; | |||
| 5393 | ||||
| 5394 | if (ST->hasSSE41()) | |||
| 5395 | if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy)) | |||
| 5396 | return MinMaxCost + Entry->Cost; | |||
| 5397 | ||||
| 5398 | if (ST->hasSSE2()) | |||
| 5399 | if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) | |||
| 5400 | return MinMaxCost + Entry->Cost; | |||
| 5401 | ||||
| 5402 | unsigned ScalarSize = ValTy->getScalarSizeInBits(); | |||
| 5403 | ||||
| 5404 | // Special case power of 2 reductions where the scalar type isn't changed | |||
| 5405 | // by type legalization. | |||
| 5406 | if (!isPowerOf2_32(ValVTy->getNumElements()) || | |||
| 5407 | ScalarSize != MTy.getScalarSizeInBits()) | |||
| 5408 | return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind); | |||
| 5409 | ||||
| 5410 | // Now handle reduction with the legal type, taking into account size changes | |||
| 5411 | // at each level. | |||
| 5412 | while (NumVecElts > 1) { | |||
| 5413 | // Determine the size of the remaining vector we need to reduce. | |||
| 5414 | unsigned Size = NumVecElts * ScalarSize; | |||
| 5415 | NumVecElts /= 2; | |||
| 5416 | // If we're reducing from 256/512 bits, use an extract_subvector. | |||
| 5417 | if (Size > 128) { | |||
| 5418 | auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); | |||
| 5419 | MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, | |||
| 5420 | CostKind, NumVecElts, SubTy); | |||
| 5421 | Ty = SubTy; | |||
| 5422 | } else if (Size == 128) { | |||
| 5423 | // Reducing from 128 bits is a permute of v2f64/v2i64. | |||
| 5424 | VectorType *ShufTy; | |||
| 5425 | if (ValTy->isFloatingPointTy()) | |||
| 5426 | ShufTy = | |||
| 5427 | FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2); | |||
| 5428 | else | |||
| 5429 | ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2); | |||
| 5430 | MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, | |||
| 5431 | std::nullopt, CostKind, 0, nullptr); | |||
| 5432 | } else if (Size == 64) { | |||
| 5433 | // Reducing from 64 bits is a shuffle of v4f32/v4i32. | |||
| 5434 | FixedVectorType *ShufTy; | |||
| 5435 | if (ValTy->isFloatingPointTy()) | |||
| 5436 | ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4); | |||
| 5437 | else | |||
| 5438 | ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4); | |||
| 5439 | MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, | |||
| 5440 | std::nullopt, CostKind, 0, nullptr); | |||
| 5441 | } else { | |||
| 5442 | // Reducing from smaller size is a shift by immediate. | |||
| 5443 | auto *ShiftTy = FixedVectorType::get( | |||
| 5444 | Type::getIntNTy(ValTy->getContext(), Size), 128 / Size); | |||
| 5445 | MinMaxCost += getArithmeticInstrCost( | |||
| 5446 | Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput, | |||
| 5447 | {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, | |||
| 5448 | {TargetTransformInfo::OK_UniformConstantValue, TargetTransformInfo::OP_None}); | |||
| 5449 | } | |||
| 5450 | ||||
| 5451 | // Add the min/max op for this level. | |||
| 5452 | auto *SubCondTy = | |||
| 5453 | FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements()); | |||
| 5454 | MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned); | |||
| 5455 | } | |||
| 5456 | ||||
| 5457 | // Add the final extract element to the cost. | |||
| 5458 | return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); | |||
| 5459 | } | |||
| 5460 | ||||
| 5461 | /// Calculate the cost of materializing a 64-bit value. This helper | |||
| 5462 | /// method might only calculate a fraction of a larger immediate. Therefore it | |||
| 5463 | /// is valid to return a cost of ZERO. | |||
| 5464 | InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) { | |||
| 5465 | if (Val == 0) | |||
| 5466 | return TTI::TCC_Free; | |||
| 5467 | ||||
| 5468 | if (isInt<32>(Val)) | |||
| 5469 | return TTI::TCC_Basic; | |||
| 5470 | ||||
| 5471 | return 2 * TTI::TCC_Basic; | |||
| 5472 | } | |||
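| | // Illustrative values: getIntImmCost(0) == TCC_Free, getIntImmCost(42) == | |||
| | // TCC_Basic (fits a sign-extended imm32), and getIntImmCost(1LL << 40) == | |||
| | // 2 * TCC_Basic (needs a full 64-bit materialization). | |||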
| 5473 | ||||
| 5474 | InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, | |||
| 5475 | TTI::TargetCostKind CostKind) { | |||
| 5476 | assert(Ty->isIntegerTy()); | |||
| 5477 | ||||
| 5478 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
| 5479 | if (BitSize == 0) | |||
| 5480 | return ~0U; | |||
| 5481 | ||||
| 5482 | // Never hoist constants larger than 128 bits, because this might lead to | |||
| 5483 | // incorrect code generation or assertions in codegen. | |||
| 5484 | // FIXME: Create a cost model for types larger than i128 once the codegen | |||
| 5485 | // issues have been fixed. | |||
| 5486 | if (BitSize > 128) | |||
| 5487 | return TTI::TCC_Free; | |||
| 5488 | ||||
| 5489 | if (Imm == 0) | |||
| 5490 | return TTI::TCC_Free; | |||
| 5491 | ||||
| 5492 | // Sign-extend all constants to a multiple of 64-bit. | |||
| 5493 | APInt ImmVal = Imm; | |||
| 5494 | if (BitSize % 64 != 0) | |||
| 5495 | ImmVal = Imm.sext(alignTo(BitSize, 64)); | |||
| 5496 | ||||
| 5497 | // Split the constant into 64-bit chunks and calculate the cost for each | |||
| 5498 | // chunk. | |||
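| | // Sketch: an i128 constant equal to 1 << 64 splits into a zero low chunk | |||
| | // (free) and a high chunk of 1 (TCC_Basic), so the clamped total below is | |||
| | // TCC_Basic. | |||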
| 5499 | InstructionCost Cost = 0; | |||
| 5500 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { | |||
| 5501 | APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); | |||
| 5502 | int64_t Val = Tmp.getSExtValue(); | |||
| 5503 | Cost += getIntImmCost(Val); | |||
| 5504 | } | |||
| 5505 | // We need at least one instruction to materialize the constant. | |||
| 5506 | return std::max<InstructionCost>(1, Cost); | |||
| 5507 | } | |||
| 5508 | ||||
| 5509 | InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, | |||
| 5510 | const APInt &Imm, Type *Ty, | |||
| 5511 | TTI::TargetCostKind CostKind, | |||
| 5512 | Instruction *Inst) { | |||
| 5513 | assert(Ty->isIntegerTy()); | |||
| 5514 | ||||
| 5515 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
| 5516 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | |||
| 5517 | // here, so that constant hoisting will ignore this constant. | |||
| 5518 | if (BitSize == 0) | |||
| 5519 | return TTI::TCC_Free; | |||
| 5520 | ||||
| 5521 | unsigned ImmIdx = ~0U; | |||
| 5522 | switch (Opcode) { | |||
| 5523 | default: | |||
| 5524 | return TTI::TCC_Free; | |||
| 5525 | case Instruction::GetElementPtr: | |||
| 5526 | // Always hoist the base address of a GetElementPtr. This prevents the | |||
| 5527 | // creation of new constants for every base constant that gets constant | |||
| 5528 | // folded with the offset. | |||
| 5529 | if (Idx == 0) | |||
| 5530 | return 2 * TTI::TCC_Basic; | |||
| 5531 | return TTI::TCC_Free; | |||
| 5532 | case Instruction::Store: | |||
| 5533 | ImmIdx = 0; | |||
| 5534 | break; | |||
| 5535 | case Instruction::ICmp: | |||
| 5536 | // This is an imperfect hack to prevent constant hoisting of | |||
| 5537 | // compares that might be trying to check if a 64-bit value fits in | |||
| 5538 | // 32-bits. The backend can optimize these cases using a right shift by 32. | |||
| 5539 | // Ideally we would check the compare predicate here. There are also other | |||
| 5540 | // similar immediates the backend can use shifts for. | |||
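| |     // E.g.: the immediate in "icmp ult i64 %x, 4294967296" is reported free | |||
| |     // so it stays inline and the backend can use a shift-right by 32. | |||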
| 5541 | if (Idx == 1 && Imm.getBitWidth() == 64) { | |||
| 5542 | uint64_t ImmVal = Imm.getZExtValue(); | |||
| 5543 | if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) | |||
| 5544 | return TTI::TCC_Free; | |||
| 5545 | } | |||
| 5546 | ImmIdx = 1; | |||
| 5547 | break; | |||
| 5548 | case Instruction::And: | |||
| 5549 | // We support 64-bit ANDs with immediates with 32-bits of leading zeroes | |||
| 5550 | // by using a 32-bit operation with implicit zero extension. Detect such | |||
| 5551 | // immediates here as the normal path expects bit 31 to be sign extended. | |||
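| |     // E.g.: "and i64 %x, 4294963200" is reported free, since the immediate | |||
| |     // fits 32 bits and encodes as an andl with implicit zero extension. | |||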
| 5552 | if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(32)) | |||
| 5553 | return TTI::TCC_Free; | |||
| 5554 | ImmIdx = 1; | |||
| 5555 | break; | |||
| 5556 | case Instruction::Add: | |||
| 5557 | case Instruction::Sub: | |||
| 5558 | // For add/sub, we can use the opposite instruction for INT32_MIN. | |||
| 5559 | if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000) | |||
| 5560 | return TTI::TCC_Free; | |||
| 5561 | ImmIdx = 1; | |||
| 5562 | break; | |||
| 5563 | case Instruction::UDiv: | |||
| 5564 | case Instruction::SDiv: | |||
| 5565 | case Instruction::URem: | |||
| 5566 | case Instruction::SRem: | |||
| 5567 | // Division by constant is typically expanded later into a different | |||
| 5568 | // instruction sequence. This completely changes the constants. | |||
| 5569 | // Report them as "free" to stop ConstantHoist from marking them as opaque. | |||
| 5570 | return TTI::TCC_Free; | |||
| 5571 | case Instruction::Mul: | |||
| 5572 | case Instruction::Or: | |||
| 5573 | case Instruction::Xor: | |||
| 5574 | ImmIdx = 1; | |||
| 5575 | break; | |||
| 5576 | // Always return TCC_Free for the shift value of a shift instruction. | |||
| 5577 | case Instruction::Shl: | |||
| 5578 | case Instruction::LShr: | |||
| 5579 | case Instruction::AShr: | |||
| 5580 | if (Idx == 1) | |||
| 5581 | return TTI::TCC_Free; | |||
| 5582 | break; | |||
| 5583 | case Instruction::Trunc: | |||
| 5584 | case Instruction::ZExt: | |||
| 5585 | case Instruction::SExt: | |||
| 5586 | case Instruction::IntToPtr: | |||
| 5587 | case Instruction::PtrToInt: | |||
| 5588 | case Instruction::BitCast: | |||
| 5589 | case Instruction::PHI: | |||
| 5590 | case Instruction::Call: | |||
| 5591 | case Instruction::Select: | |||
| 5592 | case Instruction::Ret: | |||
| 5593 | case Instruction::Load: | |||
| 5594 | break; | |||
| 5595 | } | |||
| 5596 | ||||
| 5597 | if (Idx == ImmIdx) { | |||
| 5598 | int NumConstants = divideCeil(BitSize, 64); | |||
| 5599 | InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
| 5600 | return (Cost <= NumConstants * TTI::TCC_Basic) | |||
| 5601 | ? static_cast<int>(TTI::TCC_Free) | |||
| 5602 | : Cost; | |||
| 5603 | } | |||
| 5604 | ||||
| 5605 | return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
| 5606 | } | |||
| 5607 | ||||
| 5608 | InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, | |||
| 5609 | const APInt &Imm, Type *Ty, | |||
| 5610 | TTI::TargetCostKind CostKind) { | |||
| 5611 | assert(Ty->isIntegerTy()); | |||
| 5612 | ||||
| 5613 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
| 5614 | // There is no cost model for constants with a bit size of 0. Return TCC_Free | |||
| 5615 | // here, so that constant hoisting will ignore this constant. | |||
| 5616 | if (BitSize == 0) | |||
| 5617 | return TTI::TCC_Free; | |||
| 5618 | ||||
| 5619 | switch (IID) { | |||
| 5620 | default: | |||
| 5621 | return TTI::TCC_Free; | |||
| 5622 | case Intrinsic::sadd_with_overflow: | |||
| 5623 | case Intrinsic::uadd_with_overflow: | |||
| 5624 | case Intrinsic::ssub_with_overflow: | |||
| 5625 | case Intrinsic::usub_with_overflow: | |||
| 5626 | case Intrinsic::smul_with_overflow: | |||
| 5627 | case Intrinsic::umul_with_overflow: | |||
| 5628 | if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(32)) | |||
| 5629 | return TTI::TCC_Free; | |||
| 5630 | break; | |||
| 5631 | case Intrinsic::experimental_stackmap: | |||
| 5632 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64))) | |||
| 5633 | return TTI::TCC_Free; | |||
| 5634 | break; | |||
| 5635 | case Intrinsic::experimental_patchpoint_void: | |||
| 5636 | case Intrinsic::experimental_patchpoint_i64: | |||
| 5637 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(64))) | |||
| 5638 | return TTI::TCC_Free; | |||
| 5639 | break; | |||
| 5640 | } | |||
| 5641 | return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); | |||
| 5642 | } | |||
| 5643 | ||||
| 5644 | InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode, | |||
| 5645 | TTI::TargetCostKind CostKind, | |||
| 5646 | const Instruction *I) { | |||
| 5647 | if (CostKind != TTI::TCK_RecipThroughput) | |||
| 5648 | return Opcode == Instruction::PHI ? 0 : 1; | |||
| 5649 | // Branches are assumed to be predicted. | |||
| 5650 | return 0; | |||
| 5651 | } | |||
| 5652 | ||||
| 5653 | int X86TTIImpl::getGatherOverhead() const { | |||
| 5654 | // Some CPUs have more overhead for gather. The specified overhead is relative | |||
| 5655 | // to the Load operation. "2" is the number provided by Intel architects. This | |||
| 5656 | // parameter is used for cost estimation of Gather Op and comparison with | |||
| 5657 | // other alternatives. | |||
| 5658 | // TODO: Remove the explicit hasAVX512()? That would mean we would only | |||
| 5659 | // enable gather with a -march. | |||
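| | // Sketch of the effect: with AVX-512, or AVX2 plus fast gathers, a gather | |||
| | // adds 2 to the summed per-lane load cost in getGSVectorCost; the 1024 | |||
| | // fallback effectively prices gathers out of consideration. | |||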
| 5660 | if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) | |||
| 5661 | return 2; | |||
| 5662 | ||||
| 5663 | return 1024; | |||
| 5664 | } | |||
| 5665 | ||||
| 5666 | int X86TTIImpl::getScatterOverhead() const { | |||
| 5667 | if (ST->hasAVX512()) | |||
| 5668 | return 2; | |||
| 5669 | ||||
| 5670 | return 1024; | |||
| 5671 | } | |||
| 5672 | ||||
| 5673 | // Return an average cost of a Gather / Scatter instruction; may be improved later. | |||
| 5674 | // FIXME: Add TargetCostKind support. | |||
| 5675 | InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, | |||
| 5676 | const Value *Ptr, Align Alignment, | |||
| 5677 | unsigned AddressSpace) { | |||
| 5678 | ||||
| 5679 | assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); | |||
| 5680 | unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); | |||
| 5681 | ||||
| 5682 | // Try to reduce index size from 64 bit (default for GEP) | |||
| 5683 | // to 32. It is essential for VF 16. If the index can't be reduced to 32, the | |||
| 5684 | // operation will use 16 x 64 indices, which do not fit in a zmm register and | |||
| 5685 | // need to be split. Also check that the base pointer is the same for all lanes, | |||
| 5686 | // and that there's at most one variable index. | |||
| 5687 | auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) { | |||
| 5688 | unsigned IndexSize = DL.getPointerSizeInBits(); | |||
| 5689 | const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); | |||
| 5690 | if (IndexSize < 64 || !GEP) | |||
| 5691 | return IndexSize; | |||
| 5692 | ||||
| 5693 | unsigned NumOfVarIndices = 0; | |||
| 5694 | const Value *Ptrs = GEP->getPointerOperand(); | |||
| 5695 | if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) | |||
| 5696 | return IndexSize; | |||
| 5697 | for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { | |||
| 5698 | if (isa<Constant>(GEP->getOperand(i))) | |||
| 5699 | continue; | |||
| 5700 | Type *IndxTy = GEP->getOperand(i)->getType(); | |||
| 5701 | if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy)) | |||
| 5702 | IndxTy = IndexVTy->getElementType(); | |||
| 5703 | if ((IndxTy->getPrimitiveSizeInBits() == 64 && | |||
| 5704 | !isa<SExtInst>(GEP->getOperand(i))) || | |||
| 5705 | ++NumOfVarIndices > 1) | |||
| 5706 | return IndexSize; // 64 | |||
| 5707 | } | |||
| 5708 | return (unsigned)32; | |||
| 5709 | }; | |||
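| | // Illustrative case handled by the lambda above: a VF-16 gather whose GEP | |||
| | // has one variable index that is a sign-extended i32 can use <16 x i32> | |||
| | // indices (one zmm), avoiding the split that <16 x i64> indices force. | |||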
| 5710 | ||||
| 5711 | // Try to reduce IndexSize to 32 bits for 16-element vectors. | |||
| 5712 | // By default the IndexSize is equal to the pointer size. | |||
| 5713 | unsigned IndexSize = (ST->hasAVX512() && VF >= 16) | |||
| 5714 | ? getIndexSizeInBits(Ptr, DL) | |||
| 5715 | : DL.getPointerSizeInBits(); | |||
| 5716 | ||||
| 5717 | auto *IndexVTy = FixedVectorType::get( | |||
| 5718 | IntegerType::get(SrcVTy->getContext(), IndexSize), VF); | |||
| 5719 | std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(IndexVTy); | |||
| 5720 | std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(SrcVTy); | |||
| 5721 | InstructionCost::CostType SplitFactor = | |||
| 5722 | *std::max(IdxsLT.first, SrcLT.first).getValue(); | |||
| 5723 | if (SplitFactor > 1) { | |||
| 5724 | // Handle splitting of vector of pointers | |||
| 5725 | auto *SplitSrcTy = | |||
| 5726 | FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); | |||
| 5727 | return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, | |||
| 5728 | AddressSpace); | |||
| 5729 | } | |||
| 5730 | ||||
| 5731 | // The gather / scatter cost is given by Intel architects. It is a rough | |||
| 5732 | // number since we are looking at one instruction at a time. | |||
| 5733 | const int GSOverhead = (Opcode == Instruction::Load) | |||
| 5734 | ? getGatherOverhead() | |||
| 5735 | : getScatterOverhead(); | |||
| 5736 | return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), | |||
| 5737 | MaybeAlign(Alignment), AddressSpace, | |||
| 5738 | TTI::TCK_RecipThroughput); | |||
| 5739 | } | |||
| 5740 | ||||
| 5741 | /// Return the cost of full scalarization of gather / scatter operation. | |||
| 5742 | /// | |||
| 5743 | /// Opcode - Load or Store instruction. | |||
| 5744 | /// SrcVTy - The type of the data vector that should be gathered or scattered. | |||
| 5745 | /// VariableMask - The mask is non-constant at compile time. | |||
| 5746 | /// Alignment - Alignment for one element. | |||
| 5747 | /// AddressSpace - pointer[s] address space. | |||
| 5748 | /// | |||
| 5749 | /// FIXME: Add TargetCostKind support. | |||
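| | /// A rough reading of the total assembled below: | |||
| | ///   AddressUnpackCost + VF * scalar memop cost | |||
| | ///   + (VariableMask ? mask extract + VF * (cmp + branch) : 0) | |||
| | ///   + vector insert/extract overhead. | |||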
| 5750 | InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, | |||
| 5751 | bool VariableMask, Align Alignment, | |||
| 5752 | unsigned AddressSpace) { | |||
| 5753 | Type *ScalarTy = SrcVTy->getScalarType(); | |||
| 5754 | unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); | |||
| 5755 | APInt DemandedElts = APInt::getAllOnes(VF); | |||
| 5756 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; | |||
| 5757 | ||||
| 5758 | InstructionCost MaskUnpackCost = 0; | |||
| 5759 | if (VariableMask) { | |||
| 5760 | auto *MaskTy = | |||
| 5761 | FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF); | |||
| 5762 | MaskUnpackCost = getScalarizationOverhead( | |||
| 5763 | MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true); | |||
| 5764 | InstructionCost ScalarCompareCost = getCmpSelInstrCost( | |||
| 5765 | Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr, | |||
| 5766 | CmpInst::BAD_ICMP_PREDICATE, CostKind); | |||
| 5767 | InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind); | |||
| 5768 | MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); | |||
| 5769 | } | |||
| 5770 | ||||
| 5771 | InstructionCost AddressUnpackCost = getScalarizationOverhead( | |||
| 5772 | FixedVectorType::get(ScalarTy->getPointerTo(), VF), DemandedElts, | |||
| 5773 | /*Insert=*/false, /*Extract=*/true); | |||
| 5774 | ||||
| 5775 | // The cost of the scalar loads/stores. | |||
| 5776 | InstructionCost MemoryOpCost = | |||
| 5777 | VF * getMemoryOpCost(Opcode, ScalarTy, MaybeAlign(Alignment), | |||
| 5778 | AddressSpace, CostKind); | |||
| 5779 | ||||
| 5780 | // The cost of forming the vector from loaded scalars / | |||
| 5781 | // scalarizing the vector to perform scalar stores. | |||
| 5782 | InstructionCost InsertExtractCost = | |||
| 5783 | getScalarizationOverhead(cast<FixedVectorType>(SrcVTy), DemandedElts, | |||
| 5784 | /*Insert=*/Opcode == Instruction::Load, | |||
| 5785 | /*Extract=*/Opcode == Instruction::Store); | |||
| 5786 | ||||
| 5787 | return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost; | |||
| 5788 | } | |||
| 5789 | ||||
| 5790 | /// Calculate the cost of Gather / Scatter operation | |||
| 5791 | InstructionCost X86TTIImpl::getGatherScatterOpCost( | |||
| 5792 | unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask, | |||
| 5793 | Align Alignment, TTI::TargetCostKind CostKind, | |||
| 5794 | const Instruction *I = nullptr) { | |||
| 5795 | if (CostKind != TTI::TCK_RecipThroughput) { | |||
| 5796 | if ((Opcode == Instruction::Load && | |||
| 5797 | isLegalMaskedGather(SrcVTy, Align(Alignment)) && | |||
| 5798 | !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy), | |||
| 5799 | Align(Alignment))) || | |||
| 5800 | (Opcode == Instruction::Store && | |||
| 5801 | isLegalMaskedScatter(SrcVTy, Align(Alignment)) && | |||
| 5802 | !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy), | |||
| 5803 | Align(Alignment)))) | |||
| 5804 | return 1; | |||
| 5805 | return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask, | |||
| 5806 | Alignment, CostKind, I); | |||
| 5807 | } | |||
| 5808 | ||||
| 5809 | assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); | |||
| 5810 | PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); | |||
| 5811 | if (!PtrTy && Ptr->getType()->isVectorTy()) | |||
| 5812 | PtrTy = dyn_cast<PointerType>( | |||
| 5813 | cast<VectorType>(Ptr->getType())->getElementType()); | |||
| 5814 | assert(PtrTy && "Unexpected type for Ptr argument"); | |||
| 5815 | unsigned AddressSpace = PtrTy->getAddressSpace(); | |||
| 5816 | ||||
| 5817 | if ((Opcode == Instruction::Load && | |||
| 5818 | (!isLegalMaskedGather(SrcVTy, Align(Alignment)) || | |||
| 5819 | forceScalarizeMaskedGather(cast<VectorType>(SrcVTy), | |||
| 5820 | Align(Alignment)))) || | |||
| 5821 | (Opcode == Instruction::Store && | |||
| 5822 | (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) || | |||
| 5823 | forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy), | |||
| 5824 | Align(Alignment))))) | |||
| 5825 | return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, | |||
| 5826 | AddressSpace); | |||
| 5827 | ||||
| 5828 | return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); | |||
| 5829 | } | |||
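| | // A sketch of the dispatch above (generic subtargets assumed): with AVX512VL, | |||
| | // an <8 x float> gather is legal and not forced scalar, so it is costed by | |||
| | // getGSVectorCost; on plain AVX2 without the fast-gather tuning flag, | |||
| | // isLegalMaskedGather fails and the same access is costed by getGSScalarCost. | |||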
| 5830 | ||||
| 5831 | bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, | |||
| 5832 | const TargetTransformInfo::LSRCost &C2) { | |||
| 5833 | // The X86-specific ordering here gives the instruction count first priority. | |||
| 5834 | return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, | |||
| 5835 | C1.NumIVMuls, C1.NumBaseAdds, | |||
| 5836 | C1.ScaleCost, C1.ImmCost, C1.SetupCost) < | |||
| 5837 | std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, | |||
| 5838 | C2.NumIVMuls, C2.NumBaseAdds, | |||
| 5839 | C2.ScaleCost, C2.ImmCost, C2.SetupCost); | |||
| 5840 | } | |||
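| | // std::tie compares lexicographically, so (with invented numbers) a cost of | |||
| | // {Insns=3, NumRegs=9, ...} still beats {Insns=4, NumRegs=2, ...}; the later | |||
| | // fields only break ties. | |||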
| 5841 | ||||
| 5842 | bool X86TTIImpl::canMacroFuseCmp() { | |||
| 5843 | return ST->hasMacroFusion() || ST->hasBranchFusion(); | |||
| 5844 | } | |||
| 5845 | ||||
| 5846 | bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { | |||
| 5847 | if (!ST->hasAVX()) | |||
| 5848 | return false; | |||
| 5849 | ||||
| 5850 | // The backend can't handle a single element vector. | |||
| 5851 | if (isa<VectorType>(DataTy) && | |||
| 5852 | cast<FixedVectorType>(DataTy)->getNumElements() == 1) | |||
| 5853 | return false; | |||
| 5854 | Type *ScalarTy = DataTy->getScalarType(); | |||
| 5855 | ||||
| 5856 | if (ScalarTy->isPointerTy()) | |||
| 5857 | return true; | |||
| 5858 | ||||
| 5859 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | |||
| 5860 | return true; | |||
| 5861 | ||||
| 5862 | if (ScalarTy->isHalfTy() && ST->hasBWI()) | |||
| 5863 | return true; | |||
| 5864 | ||||
| 5865 | if (!ScalarTy->isIntegerTy()) | |||
| 5866 | return false; | |||
| 5867 | ||||
| 5868 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | |||
| 5869 | return IntWidth == 32 || IntWidth == 64 || | |||
| 5870 | ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); | |||
| 5871 | } | |||
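| | // Summarizing the checks above (not an exhaustive list): with AVX, masked | |||
| | // loads of <8 x float> and <8 x i32> are legal; <16 x i8> and <8 x i16> | |||
| | // additionally require AVX512BW; a single-element vector such as | |||
| | // <1 x double> is always rejected. | |||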
| 5872 | ||||
| 5873 | bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) { | |||
| 5874 | return isLegalMaskedLoad(DataType, Alignment); | |||
| 5875 | } | |||
| 5876 | ||||
| 5877 | bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) { | |||
| 5878 | unsigned DataSize = DL.getTypeStoreSize(DataType); | |||
| 5879 | // The only supported nontemporal loads are for aligned vectors of 16 or 32 | |||
| 5880 | // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2 | |||
| 5881 | // (the equivalent stores only require AVX). | |||
| 5882 | if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32)) | |||
| 5883 | return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2(); | |||
| 5884 | ||||
| 5885 | return false; | |||
| 5886 | } | |||
| 5887 | ||||
| 5888 | bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) { | |||
| 5889 | unsigned DataSize = DL.getTypeStoreSize(DataType); | |||
| 5890 | ||||
| 5891 | // SSE4A supports nontemporal stores of float and double at arbitrary | |||
| 5892 | // alignment. | |||
| 5893 | if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy())) | |||
| 5894 | return true; | |||
| 5895 | ||||
| 5896 | // Besides the SSE4A subtarget exception above, only aligned stores are | |||
| 5897 | // available nontemporally on any other subtarget. And only stores with a | |||
| 5898 | // size of 4..32 bytes (powers of 2 only) are permitted. | |||
| 5899 | if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || | |||
| 5900 | !isPowerOf2_32(DataSize)) | |||
| 5901 | return false; | |||
| 5902 | ||||
| 5903 | // 32-byte vector nontemporal stores are supported by AVX (the equivalent | |||
| 5904 | // loads require AVX2). | |||
| 5905 | if (DataSize == 32) | |||
| 5906 | return ST->hasAVX(); | |||
| 5907 | if (DataSize == 16) | |||
| 5908 | return ST->hasSSE1(); | |||
| 5909 | return true; | |||
| 5910 | } | |||
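| | // Roughly, assuming the usual lowering: 4/8-byte NT stores map to movnti, | |||
| | // 16-byte to movntps/movntdq (SSE1/SSE2), 32-byte to vmovntps (AVX), while | |||
| | // SSE4A's movntss/movntsd cover the unaligned scalar float/double case. | |||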
| 5911 | ||||
| 5912 | bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy, | |||
| 5913 | ElementCount NumElements) const { | |||
| 5914 | // SSE3's movddup broadcasts a single double into <2 x double>. | |||
| 5915 | return ST->hasSSE3() && !NumElements.isScalable() && | |||
| 5916 | NumElements.getFixedValue() == 2 && | |||
| 5917 | ElementTy == Type::getDoubleTy(ElementTy->getContext()); | |||
| 5918 | } | |||
| 5919 | ||||
| 5920 | bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) { | |||
| 5921 | if (!isa<VectorType>(DataTy)) | |||
| 5922 | return false; | |||
| 5923 | ||||
| 5924 | if (!ST->hasAVX512()) | |||
| 5925 | return false; | |||
| 5926 | ||||
| 5927 | // The backend can't handle a single element vector. | |||
| 5928 | if (cast<FixedVectorType>(DataTy)->getNumElements() == 1) | |||
| 5929 | return false; | |||
| 5930 | ||||
| 5931 | Type *ScalarTy = cast<VectorType>(DataTy)->getElementType(); | |||
| 5932 | ||||
| 5933 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | |||
| 5934 | return true; | |||
| 5935 | ||||
| 5936 | if (!ScalarTy->isIntegerTy()) | |||
| 5937 | return false; | |||
| 5938 | ||||
| 5939 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | |||
| 5940 | return IntWidth == 32 || IntWidth == 64 || | |||
| 5941 | ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2()); | |||
| 5942 | } | |||
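| | // Following the checks above: an <8 x float> expand-load is legal with plain | |||
| | // AVX512F, a <16 x i16> one needs AVX512VBMI2 (vpexpandw), and a | |||
| | // single-element vector is rejected outright. | |||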
| 5943 | ||||
| 5944 | bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) { | |||
| 5945 | return isLegalMaskedExpandLoad(DataTy); | |||
| 5946 | } | |||
| 5947 | ||||
| 5948 | bool X86TTIImpl::supportsGather() const { | |||
| 5949 | // Some CPUs have better gather performance than others. | |||
| 5950 | // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only | |||
| 5951 | // enable gather when an appropriate -march is given. | |||
| 5952 | return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()); | |||
| 5953 | } | |||
| 5954 | ||||
| 5955 | bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) { | |||
| 5956 | // Gather / Scatter on 2-element vectors is not profitable on KNL / SKX. | |||
| 5957 | // A 4-element gather/scatter instruction does not exist on KNL. We could | |||
| 5958 | // extend it to 8 elements, but zeroing the upper bits of the mask vector | |||
| 5959 | // would add more instructions. Right now we give 4-element vectors the | |||
| 5960 | // scalar cost on KNL. TODO: check whether the gather/scatter instruction is | |||
| 5961 | // better in the VariableMask case. | |||
| 5962 | unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements(); | |||
| 5963 | return NumElts == 1 || | |||
| 5964 | (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX()))); | |||
| 5965 | } | |||
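| | // E.g. a <2 x double> gather is forced scalar on any AVX-512 subtarget, and | |||
| | // a <4 x i32> gather is forced scalar on KNL (no VLX) but kept as a real | |||
| | // gather on SKX. | |||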
| 5966 | ||||
| 5967 | bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { | |||
| 5968 | if (!supportsGather()) | |||
| 5969 | return false; | |||
| 5970 | Type *ScalarTy = DataTy->getScalarType(); | |||
| 5971 | if (ScalarTy->isPointerTy()) | |||
| 5972 | return true; | |||
| 5973 | ||||
| 5974 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) | |||
| 5975 | return true; | |||
| 5976 | ||||
| 5977 | if (!ScalarTy->isIntegerTy()) | |||
| 5978 | return false; | |||
| 5979 | ||||
| 5980 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); | |||
| 5981 | return IntWidth == 32 || IntWidth == 64; | |||
| 5982 | } | |||
| 5983 | ||||
| 5984 | bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, | |||
| 5985 | unsigned Opcode1, | |||
| 5986 | const SmallBitVector &OpcodeMask) const { | |||
| 5987 | // ADDSUBPS 4xf32 SSE3 | |||
| 5988 | // VADDSUBPS 4xf32 AVX | |||
| 5989 | // VADDSUBPS 8xf32 AVX | |||
| 5990 | // ADDSUBPD 2xf64 SSE3 | |||
| 5991 | // VADDSUBPD 2xf64 AVX | |||
| 5992 | // VADDSUBPD 4xf64 AVX | |||
| 5993 | ||||
| 5994 | unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements(); | |||
| 5995 | assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible"); | |||
| 5996 | if (!isPowerOf2_32(NumElements)) | |||
| 5997 | return false; | |||
| 5998 | // Check the opcode pattern. We apply the mask on the opcode arguments and | |||
| 5999 | // then check if it is what we expect. | |||
| 6000 | for (int Lane : seq<int>(0, NumElements)) { | |||
| 6001 | unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0; | |||
| 6002 | // We expect FSub for even lanes and FAdd for odd lanes. | |||
| 6003 | if (Lane % 2 == 0 && Opc != Instruction::FSub) | |||
| 6004 | return false; | |||
| 6005 | if (Lane % 2 == 1 && Opc != Instruction::FAdd) | |||
| 6006 | return false; | |||
| 6007 | } | |||
| 6008 | // Now check that the pattern is supported by the target ISA. | |||
| 6009 | Type *ElemTy = cast<VectorType>(VecTy)->getElementType(); | |||
| 6010 | if (ElemTy->isFloatTy()) | |||
| 6011 | return ST->hasSSE3() && NumElements % 4 == 0; | |||
| 6012 | if (ElemTy->isDoubleTy()) | |||
| 6013 | return ST->hasSSE3() && NumElements % 2 == 0; | |||
| 6014 | return false; | |||
| 6015 | } | |||
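| | // Example of a matching pattern (hypothetical IR): for <4 x float> with | |||
| | // Opcode0=FSub, Opcode1=FAdd and OpcodeMask=0b1010, lanes 0/2 subtract and | |||
| | // lanes 1/3 add, which is exactly SSE3's addsubps; an all-FAdd mask is | |||
| | // rejected by the lane check above. | |||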
| 6016 | ||||
| 6017 | bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) { | |||
| 6018 | // AVX2 doesn't support scatter | |||
| 6019 | if (!ST->hasAVX512()) | |||
| 6020 | return false; | |||
| 6021 | return isLegalMaskedGather(DataType, Alignment); | |||
| 6022 | } | |||
| 6023 | ||||
| 6024 | bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { | |||
| 6025 | EVT VT = TLI->getValueType(DL, DataType); | |||
| 6026 | return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT); | |||
| 6027 | } | |||
| 6028 | ||||
| 6029 | bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction *I) { | |||
| 6030 | // FDIV is always expensive, even if it has a very low uop count. | |||
| 6031 | // TODO: Still necessary for recent CPUs with low latency/throughput fdiv? | |||
| 6032 | if (I->getOpcode() == Instruction::FDiv) | |||
| 6033 | return true; | |||
| 6034 | ||||
| 6035 | return BaseT::isExpensiveToSpeculativelyExecute(I); | |||
| 6036 | } | |||
| 6037 | ||||
| 6038 | bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) { | |||
| 6039 | return false; | |||
| 6040 | } | |||
| 6041 | ||||
| 6042 | bool X86TTIImpl::areInlineCompatible(const Function *Caller, | |||
| 6043 | const Function *Callee) const { | |||
| 6044 | const TargetMachine &TM = getTLI()->getTargetMachine(); | |||
| 6045 | ||||
| 6046 | // Treat this as a subset check over subtarget features. | |||
| 6047 | const FeatureBitset &CallerBits = | |||
| 6048 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); | |||
| 6049 | const FeatureBitset &CalleeBits = | |||
| 6050 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); | |||
| 6051 | ||||
| 6052 | // Check whether features are the same (apart from the ignore list). | |||
| 6053 | FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList; | |||
| 6054 | FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; | |||
| 6055 | if (RealCallerBits == RealCalleeBits) | |||
| 6056 | return true; | |||
| 6057 | ||||
| 6058 | // If the features are a subset, we need to additionally check for calls | |||
| 6059 | // that may become ABI-incompatible as a result of inlining. | |||
| 6060 | if ((RealCallerBits & RealCalleeBits) != RealCalleeBits) | |||
| 6061 | return false; | |||
| 6062 | ||||
| 6063 | for (const Instruction &I : instructions(Callee)) { | |||
| 6064 | if (const auto *CB = dyn_cast<CallBase>(&I)) { | |||
| 6065 | SmallVector<Type *, 8> Types; | |||
| 6066 | for (Value *Arg : CB->args()) | |||
| 6067 | Types.push_back(Arg->getType()); | |||
| 6068 | if (!CB->getType()->isVoidTy()) | |||
| 6069 | Types.push_back(CB->getType()); | |||
| 6070 | ||||
| 6071 | // Simple types are always ABI compatible. | |||
| 6072 | auto IsSimpleTy = [](Type *Ty) { | |||
| 6073 | return !Ty->isVectorTy() && !Ty->isAggregateType(); | |||
| 6074 | }; | |||
| 6075 | if (all_of(Types, IsSimpleTy)) | |||
| 6076 | continue; | |||
| 6077 | ||||
| 6078 | if (Function *NestedCallee = CB->getCalledFunction()) { | |||
| 6079 | // Assume that intrinsics are always ABI compatible. | |||
| 6080 | if (NestedCallee->isIntrinsic()) | |||
| 6081 | continue; | |||
| 6082 | ||||
| 6083 | // Do a precise compatibility check. | |||
| 6084 | if (!areTypesABICompatible(Caller, NestedCallee, Types)) | |||
| 6085 | return false; | |||
| 6086 | } else { | |||
| 6087 | // We don't know the target features of the callee, | |||
| 6088 | // assume it is incompatible. | |||
| 6089 | return false; | |||
| 6090 | } | |||
| 6091 | } | |||
| 6092 | } | |||
| 6093 | return true; | |||
| 6094 | } | |||
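| | // A sketch of the intent: an AVX2 callee may be inlined into an AVX-512 | |||
| | // caller (its features are a subset), but only if no call inside it passes | |||
| | // vectors or aggregates whose ABI could change under the caller's features; | |||
| | // an SSE2-only caller can never absorb an AVX2 callee. | |||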
| 6095 | ||||
| 6096 | bool X86TTIImpl::areTypesABICompatible(const Function *Caller, | |||
| 6097 | const Function *Callee, | |||
| 6098 | const ArrayRef<Type *> &Types) const { | |||
| 6099 | if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) | |||
| 6100 | return false; | |||
| 6101 | ||||
| 6102 | // If we get here, we know the target features match. If one function | |||
| 6103 | // considers 512-bit vectors legal and the other does not, consider them | |||
| 6104 | // incompatible. | |||
| 6105 | const TargetMachine &TM = getTLI()->getTargetMachine(); | |||
| 6106 | ||||
| 6107 | if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() == | |||
| 6108 | TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs()) | |||
| 6109 | return true; | |||
| 6110 | ||||
| 6111 | // Consider the arguments compatible if they aren't vectors or aggregates. | |||
| 6112 | // FIXME: Look at the size of vectors. | |||
| 6113 | // FIXME: Look at the element types of aggregates to see if there are vectors. | |||
| 6114 | return llvm::none_of(Types, | |||
| 6115 | [](Type *T) { return T->isVectorTy() || T->isAggregateType(); }); | |||
| 6116 | } | |||
| 6117 | ||||
| 6118 | X86TTIImpl::TTI::MemCmpExpansionOptions | |||
| 6119 | X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { | |||
| 6120 | TTI::MemCmpExpansionOptions Options; | |||
| 6121 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); | |||
| 6122 | Options.NumLoadsPerBlock = 2; | |||
| 6123 | // All GPR and vector loads can be unaligned. | |||
| 6124 | Options.AllowOverlappingLoads = true; | |||
| 6125 | if (IsZeroCmp) { | |||
| 6126 | // Only enable vector loads for equality comparison. Right now the vector | |||
| 6127 | // version is not as fast for three-way compare (see #33329). | |||
| 6128 | const unsigned PreferredWidth = ST->getPreferVectorWidth(); | |||
| 6129 | if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64); | |||
| 6130 | if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32); | |||
| 6131 | if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); | |||
| 6132 | } | |||
| 6133 | if (ST->is64Bit()) { | |||
| 6134 | Options.LoadSizes.push_back(8); | |||
| 6135 | } | |||
| 6136 | Options.LoadSizes.push_back(4); | |||
| 6137 | Options.LoadSizes.push_back(2); | |||
| 6138 | Options.LoadSizes.push_back(1); | |||
| 6139 | return Options; | |||
| 6140 | } | |||
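| | // For instance, a 64-bit AVX2 target with the default 256-bit preferred | |||
| | // vector width gets LoadSizes = {32, 16, 8, 4, 2, 1} for equality memcmp, so | |||
| | // memcmp(a, b, 24) == 0 can expand to one 16-byte and one overlapping | |||
| | // 16-byte compare instead of a libcall (illustrative; the actual expansion | |||
| | // is chosen by the generic MemCmpExpansion logic). | |||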
| 6141 | ||||
| 6142 | bool X86TTIImpl::prefersVectorizedAddressing() const { | |||
| 6143 | return supportsGather(); | |||
| 6144 | } | |||
| 6145 | ||||
| 6146 | bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const { | |||
| 6147 | return false; | |||
| 6148 | } | |||
| 6149 | ||||
| 6150 | bool X86TTIImpl::enableInterleavedAccessVectorization() { | |||
| 6151 | // TODO: We expect this to be beneficial regardless of arch, | |||
| 6152 | // but there are currently some unexplained performance artifacts on Atom. | |||
| 6153 | // As a temporary solution, disable on Atom. | |||
| 6154 | return !(ST->isAtom()); | |||
| 6155 | } | |||
| 6156 | ||||
| 6157 | // Get an estimate for interleaved load/store operations and strided loads. | |||
| 6158 | // \p Indices contains the indices for a strided load. | |||
| 6159 | // \p Factor - the factor of interleaving. | |||
| 6160 | // AVX-512 provides 3-src shuffles that significantly reduce the cost. | |||
| 6161 | InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( | |||
| 6162 | unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, | |||
| 6163 | ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, | |||
| 6164 | TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { | |||
| 6165 | // VecTy for interleave memop is <VF*Factor x Elt>. | |||
| 6166 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have | |||
| 6167 | // VecTy = <12 x i32>. | |||
| 6168 | ||||
| 6169 | // Calculate the number of memory operations (NumOfMemOps), required | |||
| 6170 | // for load/store the VecTy. | |||
| 6171 | MVT LegalVT = getTypeLegalizationCost(VecTy).second; | |||
| 6172 | unsigned VecTySize = DL.getTypeStoreSize(VecTy); | |||
| 6173 | unsigned LegalVTSize = LegalVT.getStoreSize(); | |||
| 6174 | unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; | |||
| 6175 | ||||
| 6176 | // Get the cost of one memory operation. | |||
| 6177 | auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(), | |||
| 6178 | LegalVT.getVectorNumElements()); | |||
| 6179 | InstructionCost MemOpCost; | |||
| 6180 | bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps; | |||
| 6181 | if (UseMaskedMemOp) | |||
| 6182 | MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment, | |||
| 6183 | AddressSpace, CostKind); | |||
| 6184 | else | |||
| 6185 | MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, MaybeAlign(Alignment), | |||
| 6186 | AddressSpace, CostKind); | |||
| 6187 | ||||
| 6188 | unsigned VF = VecTy->getNumElements() / Factor; | |||
| 6189 | MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); | |||
| 6190 | ||||
| 6191 | InstructionCost MaskCost; | |||
| 6192 | if (UseMaskedMemOp) { | |||
| 6193 | APInt DemandedLoadStoreElts = APInt::getZero(VecTy->getNumElements()); | |||
| 6194 | for (unsigned Index : Indices) { | |||
| 6195 | assert(Index < Factor && "Invalid index for interleaved memory op"); | |||
| 6196 | for (unsigned Elm = 0; Elm < VF; Elm++) | |||
| 6197 | DemandedLoadStoreElts.setBit(Index + Elm * Factor); | |||
| 6198 | } | |||
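| | // E.g. with Factor=3, VF=4 and Indices={0, 2}, the interleaved vector has 12 | |||
| | // lanes laid out as a0 b0 c0 a1 b1 c1 ..., and the loop sets bits {0,3,6,9} | |||
| | // for index 0 plus {2,5,8,11} for index 2. | |||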
| 6199 | ||||
| 6200 | Type *I1Type = Type::getInt1Ty(VecTy->getContext()); | |||
| 6201 | ||||
| 6202 | MaskCost = getReplicationShuffleCost( | |||
| 6203 | I1Type, Factor, VF, | |||
| 6204 | UseMaskForGaps ? DemandedLoadStoreElts | |||
| 6205 | : APInt::getAllOnes(VecTy->getNumElements()), | |||
| 6206 | CostKind); | |||
| 6207 | ||||
| 6208 | // The Gaps mask is invariant and created outside the loop, therefore the | |||
| 6209 | // cost of creating it is not accounted for here. However if we have both | |||
| 6210 | // a MaskForGaps and some other mask that guards the execution of the | |||
| 6211 | // memory access, we need to account for the cost of And-ing the two masks | |||
| 6212 | // inside the loop. | |||
| 6213 | if (UseMaskForGaps) { | |||
| 6214 | auto *MaskVT = FixedVectorType::get(I1Type, VecTy->getNumElements()); | |||
| 6215 | MaskCost += getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind); | |||
| 6216 | } | |||
| 6217 | } | |||
| 6218 | ||||
| 6219 | if (Opcode == Instruction::Load) { | |||
| 6220 | // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl) | |||
| 6221 | // contain the cost of the optimized shuffle sequence that the | |||
| 6222 | // X86InterleavedAccess pass will generate. | |||
| 6223 | // The costs of the loads and stores are computed separately from the tables. | |||
| 6224 | ||||
| 6225 | // X86InterleavedAccess supports only the following interleaved-access groups. | |||
| 6226 | static const CostTblEntry AVX512InterleavedLoadTbl[] = { | |||
| 6227 | {3, MVT::v16i8, 12}, // (load 48i8 and) deinterleave into 3 x 16i8 | |||
| 6228 | {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8 | |||
| 6229 | {3, MVT::v64i8, 22}, // (load 192i8 and) deinterleave into 3 x 64i8 | |||
| 6230 | }; | |||
| 6231 | ||||
| 6232 | if (const auto *Entry = | |||
| 6233 | CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT)) | |||
| 6234 | return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost; | |||
| 6235 | // If an entry does not exist, fall back to the default implementation. | |||
| 6236 | ||||
| 6237 | // Kind of shuffle depends on number of loaded values. | |||
| 6238 | // If we load the entire data in one register, we can use a 1-src shuffle. | |||
| 6239 | // Otherwise, we'll merge 2 sources in each operation. | |||
| 6240 | TTI::ShuffleKind ShuffleKind = | |||
| 6241 | (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc; | |||
| 6242 | ||||
| 6243 | InstructionCost ShuffleCost = getShuffleCost( | |||
| 6244 | ShuffleKind, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr); | |||
| 6245 | ||||
| 6246 | unsigned NumOfLoadsInInterleaveGrp = | |||
| 6247 | Indices.size() ? Indices.size() : Factor; | |||
| 6248 | auto *ResultTy = FixedVectorType::get(VecTy->getElementType(), | |||
| 6249 | VecTy->getNumElements() / Factor); | |||
| 6250 | InstructionCost NumOfResults = | |||
| 6251 | getTypeLegalizationCost(ResultTy).first * NumOfLoadsInInterleaveGrp; | |||
| 6252 | ||||
| 6253 | // About half of the loads may be folded into shuffles when we have only | |||
| 6254 | // one result. If we have more than one result, or the loads are masked, | |||
| 6255 | // we do not fold loads at all. | |||
| 6256 | unsigned NumOfUnfoldedLoads = | |||
| 6257 | UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; | |||
| 6258 | ||||
| 6259 | // Get a number of shuffle operations per result. | |||
| 6260 | unsigned NumOfShufflesPerResult = | |||
| 6261 | std::max((unsigned)1, (unsigned)(NumOfMemOps - 1)); | |||
| 6262 | ||||
| 6263 | // The SK_PermuteTwoSrc shuffle clobbers one of its source operands. | |||
| 6264 | // When we have more than one destination, we need additional instructions | |||
| 6265 | // to keep the sources. | |||
| 6266 | InstructionCost NumOfMoves = 0; | |||
| 6267 | if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc) | |||
| 6268 | NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2; | |||
| 6269 | ||||
| 6270 | InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + | |||
| 6271 | MaskCost + NumOfUnfoldedLoads * MemOpCost + | |||
| 6272 | NumOfMoves; | |||
| 6273 | ||||
| 6274 | return Cost; | |||
| 6275 | } | |||
| 6276 | ||||
| 6277 | // Store. | |||
| 6278 | assert(Opcode == Instruction::Store && | |||
| 6279 |        "Expected Store Instruction at this point"); | |||
| 6280 | // X86InterleavedAccess supports only the following interleaved-access groups. | |||
| 6281 | static const CostTblEntry AVX512InterleavedStoreTbl[] = { | |||
| 6282 | {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store) | |||
| 6283 | {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store) | |||
| 6284 | {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store) | |||
| 6285 | ||||
| 6286 | {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store) | |||
| 6287 | {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store) | |||
| 6288 | {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store) | |||
| 6289 | {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store) | |||
| 6290 | }; | |||
| 6291 | ||||
| 6292 | if (const auto *Entry = | |||
| 6293 | CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT)) | |||
| 6294 | return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost; | |||
| 6295 | // If an entry does not exist, fall back to the default implementation. | |||
| 6296 | ||||
| 6297 | // There are no strided stores at the moment, and a store cannot be folded | |||
| 6298 | // into a shuffle. | |||
| 6299 | unsigned NumOfSources = Factor; // The number of values to be merged. | |||
| 6300 | InstructionCost ShuffleCost = getShuffleCost( | |||
| 6301 | TTI::SK_PermuteTwoSrc, SingleMemOpTy, std::nullopt, CostKind, 0, nullptr); | |||
| 6302 | unsigned NumOfShufflesPerStore = NumOfSources - 1; | |||
| 6303 | ||||
| 6304 | // The SK_PermuteTwoSrc shuffle clobbers one of its source operands. | |||
| 6305 | // We need additional instructions to keep the sources. | |||
| 6306 | unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; | |||
| 6307 | InstructionCost Cost = | |||
| 6308 | MaskCost + | |||
| 6309 | NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) + | |||
| 6310 | NumOfMoves; | |||
| 6311 | return Cost; | |||
| 6312 | } | |||
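| | // A worked instance of the store formula above: a <16 x i16> group with | |||
| | // Factor=4 on AVX512BW legalizes to one memory op (NumOfMemOps=1) preceded | |||
| | // by NumOfShufflesPerStore = 4 - 1 = 3 two-source shuffles, plus | |||
| | // NumOfMoves = 1 * 3 / 2 = 1 copy to preserve a clobbered shuffle source. | |||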
| 6313 | ||||
| 6314 | InstructionCost X86TTIImpl::getInterleavedMemoryOpCost( | |||
| 6315 | unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices, | |||
| 6316 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, | |||
| 6317 | bool UseMaskForCond, bool UseMaskForGaps) { | |||
| 6318 | auto *VecTy = cast<FixedVectorType>(BaseTy); | |||
| 6319 | ||||
| 6320 | auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) { | |||
| 6321 | Type *EltTy = cast<VectorType>(VecTy)->getElementType(); | |||
| 6322 | if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || | |||
| 6323 | EltTy->isIntegerTy(32) || EltTy->isPointerTy()) | |||
| 6324 | return true; | |||
| 6325 | if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy()) | |||
| 6326 | return HasBW; | |||
| 6327 | return false; | |||
| 6328 | }; | |||
| 6329 | if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI())) | |||
| 6330 | return getInterleavedMemoryOpCostAVX512( | |||
| 6331 | Opcode, VecTy, Factor, Indices, Alignment, | |||
| 6332 | AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); | |||
| 6333 | ||||
| 6334 | if (UseMaskForCond || UseMaskForGaps) | |||
| 6335 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
| 6336 | Alignment, AddressSpace, CostKind, | |||
| 6337 | UseMaskForCond, UseMaskForGaps); | |||
| 6338 | ||||
| 6339 | // Get an estimate for interleaved load/store operations on SSE-AVX2. | |||
| 6340 | // As opposed to AVX-512, SSE-AVX2 targets do not have generic shuffles that | |||
| 6341 | // would allow computing the cost with a generic formula over shuffle costs. | |||
| 6342 | // We therefore use a lookup table instead, filled according to the | |||
| 6343 | // instruction sequences that codegen currently generates. | |||
| 6344 | ||||
| 6345 | // VecTy for interleave memop is <VF*Factor x Elt>. | |||
| 6346 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have | |||
| 6347 | // VecTy = <12 x i32>. | |||
| 6348 | MVT LegalVT = getTypeLegalizationCost(VecTy).second; | |||
| 6349 | ||||
| 6350 | // This function can be called with VecTy=<6 x i128>, Factor=3, in which case | |||
| 6351 | // VF=2, while v2i128 is an unsupported MVT vector type | |||
| 6352 | // (see MachineValueType.h::getVectorVT()). | |||
| 6353 | if (!LegalVT.isVector()) | |||
| 6354 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
| 6355 | Alignment, AddressSpace, CostKind); | |||
| 6356 | ||||
| 6357 | unsigned VF = VecTy->getNumElements() / Factor; | |||
| 6358 | Type *ScalarTy = VecTy->getElementType(); | |||
| 6359 | // Deduplicate entries, model floats/pointers as appropriately-sized integers. | |||
| 6360 | if (!ScalarTy->isIntegerTy()) | |||
| 6361 | ScalarTy = | |||
| 6362 | Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy)); | |||
| 6363 | ||||
| 6364 | // Get the cost of all the memory operations. | |||
| 6365 | // FIXME: discount dead loads. | |||
| 6366 | InstructionCost MemOpCosts = getMemoryOpCost( | |||
| 6367 | Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind); | |||
| 6368 | ||||
| 6369 | auto *VT = FixedVectorType::get(ScalarTy, VF); | |||
| 6370 | EVT ETy = TLI->getValueType(DL, VT); | |||
| 6371 | if (!ETy.isSimple()) | |||
| 6372 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
| 6373 | Alignment, AddressSpace, CostKind); | |||
| 6374 | ||||
| 6375 | // TODO: Complete for other data-types and strides. | |||
| 6376 | // Each combination of Stride, element bit width and VF results in a different | |||
| 6377 | // sequence; the cost tables are therefore accessed with | |||
| 6378 | // Factor (stride) and VectorType=VFxiN. | |||
| 6379 | // The cost accounts only for the shuffle sequence; | |||
| 6380 | // the cost of the loads/stores is accounted for separately. | |||
| 6381 | // | |||
| 6382 | static const CostTblEntry AVX2InterleavedLoadTbl[] = { | |||
| 6383 | {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8 | |||
| 6384 | {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8 | |||
| 6385 | {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8 | |||
| 6386 | {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8 | |||
| 6387 | {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8 | |||
| 6388 | ||||
| 6389 | {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16 | |||
| 6390 | {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16 | |||
| 6391 | {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16 | |||
| 6392 | ||||
| 6393 | {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32 | |||
| 6394 | {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32 | |||
| 6395 | {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32 | |||
| 6396 | ||||
| 6397 | {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64 | |||
| 6398 | {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64 | |||
| 6399 | {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64 | |||
| 6400 | {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64 | |||
| 6401 | ||||
| 6402 | {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8 | |||
| 6403 | {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8 | |||
| 6404 | {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8 | |||
| 6405 | {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8 | |||
| 6406 | {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8 | |||
| 6407 | ||||
| 6408 | {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16 | |||
| 6409 | {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16 | |||
| 6410 | {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16 | |||
| 6411 | {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16 | |||
| 6412 | {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16 | |||
| 6413 | ||||
| 6414 | {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32 | |||
| 6415 | {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32 | |||
| 6416 | {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32 | |||
| 6417 | {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32 | |||
| 6418 | {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32 | |||
| 6419 | ||||
| 6420 | {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64 | |||
| 6421 | {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64 | |||
| 6422 | {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64 | |||
| 6423 | {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64 | |||
| 6424 | ||||
| 6425 | {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8 | |||
| 6426 | {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8 | |||
| 6427 | {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8 | |||
| 6428 | {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8 | |||
| 6429 | {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8 | |||
| 6430 | ||||
| 6431 | {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16 | |||
| 6432 | {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16 | |||
| 6433 | {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16 | |||
| 6434 | {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16 | |||
| 6435 | {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16 | |||
| 6436 | ||||
| 6437 | {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32 | |||
| 6438 | {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32 | |||
| 6439 | {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32 | |||
| 6440 | {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32 | |||
| 6441 | {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32 | |||
| 6442 | ||||
| 6443 | {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64 | |||
| 6444 | {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64 | |||
| 6445 | {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64 | |||
| 6446 | {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64 | |||
| 6447 | ||||
| 6448 | {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8 | |||
| 6449 | {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8 | |||
| 6450 | {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8 | |||
| 6451 | {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8 | |||
| 6452 | {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8 | |||
| 6453 | ||||
| 6454 | {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16 | |||
| 6455 | {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16 | |||
| 6456 | {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16 | |||
| 6457 | {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16 | |||
| 6458 | {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16 | |||
| 6459 | ||||
| 6460 | {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32 | |||
| 6461 | {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32 | |||
| 6462 | {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32 | |||
| 6463 | {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32 | |||
| 6464 | ||||
| 6465 | {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64 | |||
| 6466 | {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64 | |||
| 6467 | {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64 | |||
| 6468 | ||||
| 6469 | {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32 | |||
| 6470 | }; | |||
| 6471 | ||||
| 6472 | static const CostTblEntry SSSE3InterleavedLoadTbl[] = { | |||
| 6473 | {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16 | |||
| 6474 | }; | |||
| 6475 | ||||
| 6476 | static const CostTblEntry SSE2InterleavedLoadTbl[] = { | |||
| 6477 | {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16 | |||
| 6478 | {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16 | |||
| 6479 | ||||
| 6480 | {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32 | |||
| 6481 | {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32 | |||
| 6482 | ||||
| 6483 | {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64 | |||
| 6484 | }; | |||
| 6485 | ||||
| 6486 | static const CostTblEntry AVX2InterleavedStoreTbl[] = { | |||
| 6487 | {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store) | |||
| 6488 | {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store) | |||
| 6489 | ||||
| 6490 | {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store) | |||
| 6491 | {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store) | |||
| 6492 | {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store) | |||
| 6493 | ||||
| 6494 | {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store) | |||
| 6495 | {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store) | |||
| 6496 | {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store) | |||
| 6497 | {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store) | |||
| 6498 | ||||
| 6499 | {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store) | |||
| 6500 | {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store) | |||
| 6501 | {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store) | |||
| 6502 | {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store) | |||
| 6503 | {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store) | |||
| 6504 | ||||
| 6505 | {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store) | |||
| 6506 | {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store) | |||
| 6507 | {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store) | |||
| 6508 | {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store) | |||
| 6509 | {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store) | |||
| 6510 | ||||
| 6511 | {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store) | |||
| 6512 | {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store) | |||
| 6513 | {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store) | |||
| 6514 | {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store) | |||
| 6515 | {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store) | |||
| 6516 | ||||
| 6517 | {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store) | |||
| 6518 | {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store) | |||
| 6519 | {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store) | |||
| 6520 | {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store) | |||
| 6521 | {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store) | |||
| 6522 | ||||
| 6523 | {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store) | |||
| 6524 | {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store) | |||
| 6525 | {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store) | |||
| 6526 | {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store) | |||
| 6527 | ||||
| 6528 | {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store) | |||
| 6529 | {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store) | |||
| 6530 | {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store) | |||
| 6531 | {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store) | |||
| 6532 | {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store) | |||
| 6533 | ||||
| 6534 | {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store) | |||
| 6535 | {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store) | |||
| 6536 | {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store) | |||
| 6537 | {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store) | |||
| 6538 | {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store) | |||
| 6539 | ||||
| 6540 | {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store) | |||
| 6541 | {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store) | |||
| 6542 | {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store) | |||
| 6543 | {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store) | |||
| 6544 | {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store) | |||
| 6545 | ||||
| 6546 | {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store) | |||
| 6547 | {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store) | |||
| 6548 | {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store) | |||
| 6549 | {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store) | |||
| 6550 | ||||
| 6551 | {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store) | |||
| 6552 | {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store) | |||
| 6553 | {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store) | |||
| 6554 | {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store) | |||
| 6555 | {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store) | |||
| 6556 | ||||
| 6557 | {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store) | |||
| 6558 | {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store) | |||
| 6559 | {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store) | |||
| 6560 | {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store) | |||
| 6561 | {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store) | |||
| 6562 | ||||
| 6563 | {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store) | |||
| 6564 | {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store) | |||
| 6565 | {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store) | |||
| 6566 | {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store) | |||
| 6567 | ||||
| 6568 | {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store) | |||
| 6569 | {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store) | |||
| 6570 | {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store) | |||
| 6571 | }; | |||
| 6572 | ||||
| 6573 | static const CostTblEntry SSE2InterleavedStoreTbl[] = { | |||
| 6574 | {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store) | |||
| 6575 | {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store) | |||
| 6576 | {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store) | |||
| 6577 | ||||
| 6578 | {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store) | |||
| 6579 | {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store) | |||
| 6580 | ||||
| 6581 | {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store) | |||
| 6582 | }; | |||
| 6583 | ||||
| 6584 | if (Opcode == Instruction::Load) { | |||
| 6585 | auto GetDiscountedCost = [Factor, NumMembers = Indices.size(), | |||
| 6586 | MemOpCosts](const CostTblEntry *Entry) { | |||
| 6587 | // NOTE: this is just an approximation! | |||
| 6588 | // It can over- or under-estimate the cost! | |||
| 6589 | return MemOpCosts + divideCeil(NumMembers * Entry->Cost, Factor); | |||
| 6590 | }; | |||
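| | // E.g. if only NumMembers = 2 of a Factor = 3 group are requested and the | |||
| | // table entry costs 11 shuffles, the discounted estimate is | |||
| | // MemOpCosts + divideCeil(2 * 11, 3) = MemOpCosts + 8. | |||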
| 6591 | ||||
| 6592 | if (ST->hasAVX2()) | |||
| 6593 | if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor, | |||
| 6594 | ETy.getSimpleVT())) | |||
| 6595 | return GetDiscountedCost(Entry); | |||
| 6596 | ||||
| 6597 | if (ST->hasSSSE3()) | |||
| 6598 | if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor, | |||
| 6599 | ETy.getSimpleVT())) | |||
| 6600 | return GetDiscountedCost(Entry); | |||
| 6601 | ||||
| 6602 | if (ST->hasSSE2()) | |||
| 6603 | if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor, | |||
| 6604 | ETy.getSimpleVT())) | |||
| 6605 | return GetDiscountedCost(Entry); | |||
| 6606 | } else { | |||
| 6607 | assert(Opcode == Instruction::Store && | |||
| 6608 |        "Expected Store Instruction at this point"); | |||
| 6609 | assert((!Indices.size() || Indices.size() == Factor) && | |||
| 6610 |        "Interleaved store only supports fully-interleaved groups."); | |||
| 6611 | if (ST->hasAVX2()) | |||
| 6612 | if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor, | |||
| 6613 | ETy.getSimpleVT())) | |||
| 6614 | return MemOpCosts + Entry->Cost; | |||
| 6615 | ||||
| 6616 | if (ST->hasSSE2()) | |||
| 6617 | if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor, | |||
| 6618 | ETy.getSimpleVT())) | |||
| 6619 | return MemOpCosts + Entry->Cost; | |||
| 6620 | } | |||
| 6621 | ||||
| 6622 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, | |||
| 6623 | Alignment, AddressSpace, CostKind, | |||
| 6624 | UseMaskForCond, UseMaskForGaps); | |||
| 6625 | } | |||
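| | // Putting it together (an illustrative query): an unmasked load of | |||
| | // <12 x i32> with Factor=3 on AVX2 gives VF=4 and ETy=v4i32, so | |||
| | // AVX2InterleavedLoadTbl matches {3, v4i32, 3} and the returned cost is the | |||
| | // <12 x i32> load cost plus the (possibly discounted) 3-shuffle sequence. | |||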
| 6626 | ||||
| 6627 | InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, | |||
| 6628 | int64_t BaseOffset, | |||
| 6629 | bool HasBaseReg, int64_t Scale, | |||
| 6630 | unsigned AddrSpace) const { | |||
| 6631 | // Scaling factors are not free at all. | |||
| 6632 | // An indexed folded instruction, i.e., inst (reg1, reg2, scale), | |||
| 6633 | // will take 2 allocations in the out of order engine instead of 1 | |||
| 6634 | // for plain addressing mode, i.e. inst (reg1). | |||
| 6635 | // E.g., | |||
| 6636 | // vaddps (%rsi,%rdx), %ymm0, %ymm1 | |||
| 6637 | // Requires two allocations (one for the load, one for the computation) | |||
| 6638 | // whereas: | |||
| 6639 | // vaddps (%rsi), %ymm0, %ymm1 | |||
| 6640 | // Requires just 1 allocation, i.e., freeing an allocation for other | |||
| 6641 | // operations and leaving fewer micro operations to execute. | |||
| 6642 | // | |||
| 6643 | // For some X86 architectures, this is even worse because for instance for | |||
| 6644 | // stores, the complex addressing mode forces the instruction to use the | |||
| 6645 | // "load" ports instead of the dedicated "store" port. | |||
| 6646 | // E.g., on Haswell: | |||
| 6647 | // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. | |||
| 6648 | // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. | |||
| 6649 | TargetLoweringBase::AddrMode AM; | |||
| 6650 | AM.BaseGV = BaseGV; | |||
| 6651 | AM.BaseOffs = BaseOffset; | |||
| 6652 | AM.HasBaseReg = HasBaseReg; | |||
| 6653 | AM.Scale = Scale; | |||
| 6654 | if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) | |||
| 6655 | // Scale represents reg2 * scale, thus account for 1 | |||
| 6656 | // as soon as we use a second register. | |||
| 6657 | return AM.Scale != 0; | |||
| 6658 | return -1; | |||
| 6659 | } |
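| | // In effect: inst (reg1) costs 0, inst (reg1, reg2, scale) costs 1, and an | |||
| | // addressing mode the target cannot fold returns -1, which callers such as | |||
| | // LSR treat as illegal. | |||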